Removing comments with a sliding window without nested while loops

前端 未结 3 1730
囚心锁ツ
囚心锁ツ 2021-01-06 10:50

I'm trying to remove comments and strings from a C file with C code. I'll just stick to comments for the examples. I have a sliding window so I only have character n

相关标签:
3条回答
  • 2021-01-06 10:57

    The algorithm written with one while loop could look like this:

    // Skeleton of a single-loop filter: read one byte per iteration and echo
    // it only when it lies outside a comment. The "..." parts are placeholders.
    // Declare c as int (getchar() returns int) so EOF stays distinguishable.
    while ((c = getchar()) != EOF)
    {
        ... // looking at the byte that was just read
    
        if (...) // the symbol is not inside a comment
        {
            putchar(c);
        }
    }
    

    To decide whether the input char belongs to a comment, you can use a state machine. In the following example, it has 4 states; there are also rules for traversing to next state.

    int state = 0;
    int next_state;
    while ((c = getchar()) != EOF)
    {
        switch (state)
        {
            case 0: next_state = (c == '/' ? 1 : 0); break;
            case 1: next_state = (c == '*' ? 2 : c == '/' ? 1 : 0); break;
            case 2: next_state = (c == '*' ? 3 : 2); break;
            case 3: next_state = (c == '/' ? 0 : c == '*' ? 3 : 2); break;
            default: next_state = state; // will never happen
        }
    
        if (state == 1 && next_state == 0)
        {
            putchar('/'); // for correct output when a slash is not followed by a star
        }
        if (state == 0 && next_state == 0)
        {
            putchar(c);
        }
        state = next_state;
    }
    

    The example above is very simple: it doesn't work correctly for /* in non-comment contexts like in C strings; it doesn't support // comments, etc.

    0 讨论(0)
  • 2021-01-06 11:00

    Doing this correctly is more complicated than one may at first think, as ably pointed out by the other comments here. I would strongly recommend writing a table-driven FSM, using a state transition diagram to get the transitions right. Trying to do anything more than a few states with case statements is horribly error-prone IMO.

    Here's a diagram in dot/graphviz format from which you could probably directly code a state table. Note that I haven't tested this at all, so YMMV.

    The semantics of the diagram are that <ch> is a fall-through: it is taken when none of the other inputs listed for that state match. End of file is an error in any state except S0, and so is any character that is neither explicitly listed nor covered by <ch>. Every character scanned is printed except when in a comment (S4 and S5), and when detecting a start comment (S1). You will have to buffer characters when detecting a start comment, and print them if it's a false start, otherwise throw them away when sure it's really a comment.

    In the dot diagram, sq is a single quote ', dq is a double quote ".

    // State transition diagram for a C comment/string scanner.
    // Labels: sq = single quote, dq = double quote, hex = a hexadecimal digit,
    // "<ch>" = fall-through taken when no other label of that state matches.
    // S0 (init) is the only accepting state.
    digraph state_machine {
        rankdir=LR;
        size="8,5";
    
        node [shape=doublecircle]; S0 /* init */;
        node [shape=circle];
    
        S0  /* init */      -> S1  /* begin_cmt */ [label = "'/'"];
        S0  /* init */      -> S2  /* in_str */    [label = dq];
        S0  /* init */      -> S3  /* in_ch */     [label = sq];
        S0  /* init */      -> S0  /* init */      [label = "<ch>"];
        S1  /* begin_cmt */ -> S4  /* in_slc */    [label = "'/'"];
        S1  /* begin_cmt */ -> S5  /* in_mlc */    [label = "'*'"];
        S1  /* begin_cmt */ -> S0  /* init */      [label = "<ch>"];
        S1  /* begin_cmt */ -> S1  /* begin_cmt */ [label = "'\\n'"]; // handle "/\n/" and "/\n*"
        // A string ends at its closing double quote; the backslash edge below
        // is the only one that may leave in_str on '\\'.
        S2  /* in_str */    -> S0  /* init */      [label = dq];
        S2  /* in_str */    -> S6  /* str_esc */   [label = "'\\'"];
        S2  /* in_str */    -> S2  /* in_str */    [label = "<ch>"];
        S3  /* in_ch */     -> S0  /* init */      [label = sq];
        S4  /* in_slc */    -> S4  /* in_slc */    [label = "<ch>"];
        S4  /* in_slc */    -> S0  /* init */      [label = "'\\n'"];
        S5  /* in_mlc */    -> S7  /* end_mlc */   [label = "'*'"];
        S5  /* in_mlc */    -> S5  /* in_mlc */    [label = "<ch>"];
        S7  /* end_mlc */   -> S7  /* end_mlc */   [label = "'*'|'\\n'"];
        S7  /* end_mlc */   -> S0  /* init */      [label = "'/'"];
        S7  /* end_mlc */   -> S5  /* in_mlc */    [label = "<ch>"];
        S6  /* str_esc */   -> S8  /* oct */       [label = "[0-3]"];
        S6  /* str_esc */   -> S9  /* hex */       [label = "'x'"];
        S6  /* str_esc */   -> S2  /* in_str */    [label = "<ch>"];
        S8  /* oct */       -> S10 /* o1 */        [label = "[0-7]"];
        S10 /* o1 */        -> S2  /* in_str */    [label = "[0-7]"];
        S9  /* hex */       -> S11 /* h1 */        [label = hex];
        S11 /* h1 */        -> S2  /* in_str */    [label = hex];
        S3  /* in_ch */     -> S12 /* ch_esc */    [label = "'\\'"];
        S3  /* in_ch */     -> S13 /* out_ch */    [label = "<ch>"];
        S13 /* out_ch */    -> S0  /* init */      [label = sq];
        // NOTE(review): as drawn, an escape other than \' stays in ch_esc until a
        // quote and then re-enters in_ch, so a constant such as '\n' does not seem
        // to reach init on its closing quote - confirm before coding a table.
        S12 /* ch_esc */    -> S3  /* in_ch */     [label = sq];
        S12 /* ch_esc */    -> S12 /* ch_esc */    [label = "<ch>"];
    }
    
    0 讨论(0)
  • 2021-01-06 11:08

    Since you only wish to use two characters for the buffer and only one while loop, I would suggest a third char to track your state (whether skipping text or not). I've put together a test program for you with inline comments explaining the logic:

    // Program to strip comments and strings from a C file
    //
    //  Build:
    //     gcc -o strip-comments strip-comments.c
    //
    //  Test:
    //     ./strip-comments strip-comments.c
    
    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/uio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <stdlib.h>
    
    /* The following is a block of strings, and comments for testing
     * the code. The comments below are deliberate fixtures: running the
     * program on its own source exercises chained comments, a comment
     * sitting directly in front of a division operator, and escaped
     * characters inside string literals.
     */
    /* test if three comments *//* chained together */// will be removed.
    static int value = 128 /* test comment within valid code *// 2;
    const char * test1 = "This is a test of \" processing"; /* testing inline comment */
    const char * test2 = "this is a test of \n within strings."; // testing inline comment
    // this is the last test
    
    
    /*
     * Copy C source text from `in` to `out`, removing comments and replacing
     * every string literal with the token NULL.
     *
     *  - "//" comments are removed up to, but not including, the newline.
     *  - Block comments are removed entirely.
     *  - Character constants are copied through untouched, so a quote or a
     *    slash inside one is never mistaken for a string or a comment.
     *  - Backslash-newline continuation of "//" comments is not handled.
     *
     * Returns 0 on success. Returns -1 after printing a message to stderr when
     * the input ends inside a block comment or a string literal, or when
     * reading `in` fails. A "//" comment that runs to end of file is valid.
     */
    int strip_c_code(FILE * in, FILE * out)
    {
       enum
       {
          IN_CODE,           /* ordinary source text                       */
          IN_SLASH,          /* read '/', held back until the next char    */
          IN_LINE_COMMENT,   /* inside a "//" comment                      */
          IN_BLOCK_COMMENT,  /* inside a block comment                     */
          IN_BLOCK_STAR,     /* inside a block comment, previous was '*'   */
          IN_STRING,         /* inside a string literal                    */
          IN_STRING_ESC,     /* inside a string, previous was a backslash  */
          IN_CHAR,           /* inside a character constant                */
          IN_CHAR_ESC        /* inside a char constant, previous was '\\'  */
       } state = IN_CODE;
       int c;   /* int, not char, so EOF never collides with a data byte */
    
       while ((c = fgetc(in)) != EOF)
       {
          switch (state)
          {
             case IN_SLASH:
             if (c == '/')
             {
                state = IN_LINE_COMMENT;
                break;
             }
             if (c == '*')
             {
                state = IN_BLOCK_COMMENT;
                break;
             }
             /* false start: release the held slash, then treat c as code */
             fputc('/', out);
             state = IN_CODE;
             /* fallthrough */
    
             case IN_CODE:
             if (c == '/')
                state = IN_SLASH;
             else if (c == '"')
                state = IN_STRING;
             else
             {
                if (c == '\'')
                   state = IN_CHAR;
                fputc(c, out);
             }
             break;
    
             case IN_LINE_COMMENT:
             if (c == '\n')
             {
                fputc(c, out);   /* the newline itself belongs to the code */
                state = IN_CODE;
             }
             break;
    
             case IN_BLOCK_COMMENT:
             if (c == '*')
                state = IN_BLOCK_STAR;
             break;
    
             case IN_BLOCK_STAR:
             if (c == '/')
                state = IN_CODE;
             else if (c != '*')
                state = IN_BLOCK_COMMENT;
             break;
    
             case IN_STRING:
             if (c == '\\')
                state = IN_STRING_ESC;
             else if (c == '"')
             {
                fputs("NULL", out); // replace string with NULL
                state = IN_CODE;
             }
             break;
    
             case IN_STRING_ESC:
             /* whatever follows a backslash is part of the string */
             state = IN_STRING;
             break;
    
             case IN_CHAR:
             fputc(c, out);
             if (c == '\\')
                state = IN_CHAR_ESC;
             else if ((c == '\'') || (c == '\n'))
                state = IN_CODE;   /* a newline also ends a stray apostrophe */
             break;
    
             case IN_CHAR_ESC:
             fputc(c, out);
             state = IN_CHAR;
             break;
    
             default:
             break;
          }
       }
    
       if (ferror(in))
       {
          fprintf(stderr, "Read error\n");
          return(-1);
       }
    
       // verify that the comment or string was terminated properly
       if ( (state == IN_BLOCK_COMMENT) || (state == IN_BLOCK_STAR) ||
            (state == IN_STRING)        || (state == IN_STRING_ESC) )
       {
          fprintf(stderr, "Unterminated comment or string\n");
          return(-1);
       }
    
       // a slash held back at end of input was plain code after all
       if (state == IN_SLASH)
          fputc('/', out);
    
       return(0);
    }
    
    
    /*
     * Entry point: strip comments and strings from the file named by the single
     * command-line argument and write the result to stdout.
     *
     * Returns 0 on success, and 1 on a usage error, when the file cannot be
     * opened, or when strip_c_code() reports a failure.
     */
    int main(int argc, char * argv[])
    {
       FILE * fs;
       int    rc;
    
       if (argc != 2)
       {
          fprintf(stderr, "Usage: %s <filename>\n", argv[0]);
          return(1);
       }
    
       if ((fs = fopen(argv[1], "r")) == NULL)
       {
          perror("fopen()");
          return(1);
       }
    
       // keep the result so a failed run is visible in the exit status
       rc = strip_c_code(fs, stdout);
    
       fclose(fs);
    
       return((rc == 0) ? 0 : 1);
    }
    
    /* end of source file */
    

    I've also posted this code on Github to make it easier to download and compile:

    https://gist.github.com/syzdek/5417109

    0 讨论(0)
提交回复
热议问题