How to de-obfuscate the ctk.c code the winner of 2001's IOCCC?

前端 未结 1 541
广开言路
广开言路 2021-02-06 12:32

I have seen the obfuscated ctk.c code, but how can I start to de-obfuscate it?

#include 
#include 
#include 
#include          


        
1条回答
  •  醉话见心
    2021-02-06 13:05

    1st step

    Using:

    sed -e'/#include/d' ctk.c | gcc -E - | sed -e's/;/;\n/g' -e's/}/}\n/g' -e '/^#/d' | indent
    

    I was able to generate the following output which while not perfect already seems to be readable a lot better:

    char x[] = "((((((((((((((((((((((", w[] =
      "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b";
    char r[] = { 92, 124, 47 }
    
    , l[] =
    {
    2, 3, 1, 0}
    
    ;
    char *T[] = { "  |", "  |", "%\\|/%", " %%%", "" }
    
    ;
    char d = 1, p = 40, o = 40, k = 0, *a, y, z, g = -1, G, X, **P = &T[4], f = 0;
    unsigned int s = 0;
    void
    u (int i)
    {
      int n;
      printf ("\233;
    %uH\233L%c\233;
    %uH%c\233;
    %uH%s\23322;
    %uH@\23323;
    %uH \n", *x - *w, r[d], *x + *w, r[d], X, *P, p += k, o);
      if (abs (p - x[21]) >= w[21])
        exit (0);
      if (g != G)
        {
          struct itimerval t = { 0, 0, 0, 0 }
          ;
          g += ((g < G) << 1) - 1;
          t.it_interval.tv_usec = t.it_value.tv_usec = 72000 / ((g >> 3) + 1);
          setitimer (0, &t, 0);
          f && printf ("\e[10;
    %u]", g + 24);
        }
      f && putchar (7);
      s += (9 - w[21]) * ((g >> 3) + 1);
      o = p;
      a = x;
      z = *a;
      while (*++a)
        {
          y = *a;
          *a = z;
          z = y;
        }
      ;
      a = w;
      z = *a;
      while (*++a)
        {
          y = *a;
          *a = z;
          z = y;
        }
      ;
      (n = rand ()) & 255 || --*w || ++*w;
      if (!(**P && P++ || n & 7936))
        {
          while (abs ((X = rand () % 76) - *x + 2) - *w < 6);
          ++X;
          P = T;
        }
      (n = rand () & 31) < 3 && (d = n);
      !d && --*x <= *w && (++*x, ++d) || d == 2 && ++*x + *w > 79 && (--*x, --d);
      signal (i, u);
    }
    
    void
    e ()
    {
      signal (14, SIG_IGN);
      printf ("\e[0q\ecScore: %u\n", s);
      system ("stty echo -cbreak");
    }
    
    int main (int C, char **V)
    {
      atexit (e);
      (C < 2 || *V[1] != 113)
        && (f = (C = *(int *) getenv ("TERM")) == (int) 0x756E696C
        || C == (int) 0x6C696E75);
      srand (getpid ());
      system ("stty -echo cbreak");
      G = 0 << 3;
      printf ("\e[%uq", l[0]);
      u (14);
      for (;;)
        switch (getchar ())
          {
          case 113:
        return 0;
          case 91:
          case 98:
          case 44:
        k = -1;
        continue;
          case 32:
          case 110:
          case 46:
        k = 0;
        continue;
          case 93:
          case 109:
          case 47:
        k = 1;
        continue;
          case 49:
        G = 0 << 3;
        printf ("\e[%uq", l[0]);
        continue;
          case 50:
        G = 1 << 3;
        printf ("\e[%uq", l[1]);
        continue;
          case 51:
        G = 2 << 3;
        printf ("\e[%uq", l[2]);
        continue;
          case 52:
        G = 3 << 3;
        printf ("\e[%uq", l[3]);
        continue;
          }
    }
    

    ... and now?

    I don't think there's much more an automated process will be able to perform at this point, as whether the code is "more" readable or "less" readable from now on might depend on the specific preferences of the reader.

    One step that could be performed is removing escape sequences from the strings and placing them somewhere separately. As it turns out the whole

    char l[] = {2, 3, 1, 0}
    

    has no other purpose than to be utilized in the escape sequences of the main loop:

    printf ("\e[%uq", l[0]);
    

    and so on. Looking up their meaning:

    ESC [ 0 q: clear all LEDs
    ESC [ 1 q: set Scroll Lock LED
    ESC [ 2 q: set Num Lock LED
    ESC [ 3 q: set Caps Lock LED
    

    depending on taste you might want to exchange them with a macro or a function call more meaningful to you like clear_all_LEDs and so on.

    I strongly doubt a machine would agree on this being a simplification. As it turns out the whole main loop just seems to be working with keys entered by the user, so probably turning numbers into their corresponding characters might add to readability, like in replacing:

    case 113:
      return 0;
    case 91:
    case 98:
    case 44:
      k = -1;
    // ...
    case 49:
      G = 0 << 3;
      printf ("\e[%uq", l[0]);
    

    with something like:

    case 'q':
      return 0;
    case '[':
    case 'b':
    case ',':
      k = -1;
    // ...
    case '1':
      G = 0 << 3;
      set_Num_Lock_LED ();
    

    Oh - and while we are at it already why wouldn't we want to change the name from this rather strange G to gear. Again I strongly doubt an automated process would have found renaming from G to gear any better than renaming it to butterfly. Well maybe it even isn't.

    While beautifying names maybe this function referenced by a single u is another candidate:

    u (14);
    

    with a more meaningful name, update probably. And as we already included <signal.h>, why don't we deobfuscate the code further by replacing 14 with SIGALRM like this:

    update (SIGALRM);
    

    As you can see "deobfuscating" here requires the exact opposite step of that taken before. Replacing the expansion with a macro this time. How would a machine try to decide which one is more useful?

    Another spot where we might want to replace a bare number with something else is this one in the update function:

    f && putchar (7);
    

    Why not replace the 7 with \a as it will turn out to be the same in the end. Maybe we should even change the bare f with something more "meaningful".

    Again I vote against butterfly but would rather like to call it play_sound:

    if (play_sound)
       putchar ('\a');
    

    might be the more readable version we are looking for. Sure we shouldn't forget to replace f in all other spots. The one right at the beginning of our main function being such a culprit:

    Holy mess

    int main (int C, char **V)
    {
      atexit (e);
      (C < 2 || *V[1] != 113)
        && (f = (C = *(int *) getenv ("TERM")) == (int) 0x756E696C
        || C == (int) 0x6C696E75);
    

    While happily renaming f to play_sound and e to - no, still no butterfly, this time I'll rather call it: - end we spot that the function signature seems to look a bit strange in terms of naming conventions: argc instead of C and argv instead of V would seem more conventional here. Thus giving us:

    int main (int argc, char* argv[])
    {
      atexit (end);
      (argc < 2 || *argv[1] != 113)
        && (play_sound = (argc = *(int *) getenv ("TERM")) == (int) 0x756E696C
        || argc == (int) 0x6C696E75);
    

    As this is still not a beauty we ask our standards guy and he informs us that it's pretty OK to replace

    (A || B) && (C)
    

    with

    if (A || B) { C }
    

    and

    E = (x=F)==H || x==I
    

    with

    x = F;
    if (x==H || x==I)
      E=1;
    else
      E=0;
    

    So maybe this should be a more readable version of the whole code:

    if (argc < 2 || *argv[1] != 'q') {
       argc = *(int*) getenv ("TERM");
       if (argc == (int) 0x756E696C || argc == (int) 0x6C696E75)
         play_sound = 1;
       /* skip the else branch here as play_sound is already initialized to 0 */
    }
    

    Now still another guy turns up and starts to inform us that, depending on something called endianness, those strange-looking numbers 0x6C696E75 and 0x756E696C, if stored in memory, would (when interpreting the raw byte values as ASCII codes) just look like "linu" or "unil": one being "unil" on one architecture type and "linu" the other one, while it is just the other way round on an architecture with different endianness.

    So taking a closer look what's essentially happening here is:

    • we get a pointer to a string from getenv ("TERM"), which we typecast to a pointer to an int before dereferencing it, thus reading the bit pattern stored at the string location as an int.
    • next we compare this value with the one we would get if had performed the same with either "unil" or "linu" stored at that specific location.

    Probably we just want to check if the TERM environment variable is set to "linux" so our deobfuscated version might want to perform a string comparison here.

    As on the other hand we can't be sure if also allowing terminals with names starting with "unil" to play sound might be a special feature of this software so I decided to probably better leave it intact.

    What now ?

    While renaming and re-encoding variable names and values those strange char arrays could be our next victims. The following mess doesn't look too nice:

    char x[] = "((((((((((((((((((((((", w[] =
      "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b";
    char r[] = { 92, 124, 47 };
    

    So maybe they could be changed to:

    char x_offset[] = {
      40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
      40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
      40, 40, 0 };
    
    char width[] = {
      8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
      8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
      8, 8, 0 };
    
    const char border[] = "\\|/";
    

    As you can see I just chose to switch the way the values are described between x as string constant to x written down as an array as this way the purpose of the values stored here seemed a little bit clearer to me.

    On the other hand I changed the way r is written down in exactly the opposite direction, as again this seemed a lot clearer to me.

    While hunting down all those refs to x, w and r the time could be used to rename p and o to - sorry again no butterfly - pos and old_pos while renaming s to score.

    Changing for example:

      s += (9 - w[21]) * ((g >> 3) + 1);
      o = p;
      a = x;
      z = *a;
      while (*++a)
        {
          y = *a;
          *a = z;
          z = y;
        }
      ;
      a = w;
      z = *a;
      while (*++a)
        {
          y = *a;
          *a = z;
          z = y;
        }
      ;
    

    to:

      /* update score */
      score += (9 - width[NEXT_LINE]) * ((g >> 3) + 1);
      old_pos = pos;
    
      /* shift x_offset */
      a = x_offset;
      z = *a;
      while (*++a) {
        y = *a;
        *a = z;
        z = y;
      };
    
      /* shift width */
      a = width;
      z = *a;
      while (*++a) {
        y = *a;
        *a = z;
        z = y;
      };
    

    Besides the possibility to turn it into some other kind of loop there's not much beautification possible for both shifting functions so probably adding an appropriate comment is the maximum you can do. Removing the magic number 21 might be another idea NEXT_LINE didn't seem to be the worst choice here.

    The single character labeled variable g still doesn't look too good. But renaming it to something like update_interval there's also the chance to eliminate another weird terminal escape sequence:

     if (g != G)
        {
          struct itimerval t = { 0, 0, 0, 0 }
          ;
          g += ((g < G) << 1) - 1;
          t.it_interval.tv_usec = t.it_value.tv_usec = 72000 / ((g >> 3) + 1);
          setitimer (0, &t, 0);
          f && printf ("\e[10;
    %u]", g + 24);
        }
    

    Maybe looks a little bit more confusing than:

      /* update simulation speed */
      if (update_interval != gear) {
        struct itimerval t = { 0, 0, 0, 0 }  ;
          update_interval += ((update_interval < gear) << 1) - 1;
          t.it_interval.tv_usec = t.it_value.tv_usec = 72000 / ((update_interval >> 3) + 1);
          setitimer (0, &t, 0);
          if (play_sound)
            change_bell_frequency (update_interval + 24);
      }
    

    Last fixes

    Although the code should look a lot more readable by now there are still some nasty parts left:

    !d && --*x <= *w && (++*x, ++d) || d == 2 && ++*x + *w > 79 && (--*x, --d);
    

    Choosing another (hopefully) more meaningful name for d and breaking operator precedence down you might end up with something like:

      if (curve == CURVE_LEFT) {
        --*x_offset;
        if (*x_offset <= *width) {
           ++*x_offset;
           curve = CURVE_NONE;
        }
      }
      else if (curve == CURVE_RIGHT) {
        ++*x_offset;
        if (*x_offset + *width > 79) {
          --*x_offset;
          curve = CURVE_NONE;
        }
      }
    

    instead adding appropriate macros for all those CURVE_...s.

    Now there are still those X, P and T names hanging around that also might be changed. As it makes its purpose also a little bit better visible in code I decided to flip the line order of T that I renamed to tree which sure means the calculation also has to be fixed. All in all it's from:

    char *T[] = { "  |", "  |", "%\\|/%", " %%%", "" };
    char X, **P = &T[4];
    
    // ...
    
      if (!(**P && P++ || n & 7936))
        {
          while (abs ((X = rand () % 76) - *x + 2) - *w < 6);
          ++X;
          P = T;
        }
    

    To something like:

    char *tree[] = {
      "",
      " %%%",
      "%\\|/%",
      "  |",
      "  |",
    };
    
    char **tree_line = tree;
    char tree_position;
    
    // ...
    
      /* update tree line pointer */
      if (!(**tree_line && tree_line-- || n & 7936)) {
        /* find the right spot to grow */
        while (abs ((tree_position = rand () % 76) - *x_offset + 2) - *width < 6)
          ;
        ++tree_position;
        tree_line = &tree[4];
      }
    

    Keeping the best part until the end

    Although the code already seems to looks a lot prettier to me now there's still one part missing. That's the one that's doing all the output. It's this line I'm talking about:

     printf ("\233;%uH\233L%c\233;%uH%c\233;%uH%s\23322;%uH@\23323;%uH \n",
          *x - *w, r[d], *x + *w, r[d], X, *P, p += k, o); 
    

    Apart from looking pretty hard to read, that line was even too obfuscated for my computer to produce any usable result.

    I tried a lot of different things: running in other terminal emulators, changing terminal settings and switching locales back and forth, without success.

    So besides the fact that this kind of obfuscation seems to be more than perfect, as it even seems to confuse my computer, I still can't tell what trick the author intended here.

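    The octal code \233 has the same bit-pattern as the escape character (\033) with the 8-th bit set additionally, which probably is in some way related to the effect that was intended here. In fact 0x9B is the single-byte "CSI" (Control Sequence Introducer) control character, which 8-bit terminals treat as equivalent to the two-character sequence ESC [; a terminal running in UTF-8 mode most likely does not accept a lone 0x9B byte, which would explain why it didn't work for me.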

    Fortunately enough the escape sequences still seemed easy enough to guess, so I came up with the following replacement:

    pos += move_x,

      /* draw street */
      printf ("\e[1;%uH" "\e[L" "%c"
              "\e[1;%uH" "%c",
              *x_offset - *width, border[curve],
              *x_offset + *width, border[curve]);
      /* draw tree */
      printf ("\e[1;%uH" "%s",
              tree_position, *tree_line);
    
      /* redraw car */
      printf ("\e[22;%uH" "@"
              "\e[23;%uH" " " "\n",
              pos,
              old_pos);  
    

    Breaking the drawing down into separate printf calls (hopefully) makes it a little bit more readable. The actual line and the previous line are still hard coded here as in the original version. Maybe extracting them from there as shown below would even improve readability:

      /* draw street */
      printf ("\e[1;%uH" "\e[L" "%c"
              "\e[1;%uH" "%c",
              *x_offset - *width, border[curve],
              *x_offset + *width, border[curve]);
      /* draw tree */
      printf ("\e[1;%uH" "%s",
              tree_position, *tree_line);
    
      /* redraw car */
      printf ("\e[%u;%uH" "@"
              "\e[%u;%uH" " " "\n",
              NEXT_LINE +1, pos,
              NEXT_LINE +2, old_pos);
    

    This finally brought me to the first usable version which I then "tested" a lot. While probably not 100% state of the art it still seems to be very addictive.

    Last words

    Here is the final unobfuscated version that I came up with. As you'll see I didn't implement the LED setting functions and the clear screen function, but it shouldn't be too hard to find the needed escape sequences scattered throughout the obfuscated version. In fact I already mentioned the LED sequences in this post. The one to clear the screen is "\ec" (it directly follows the "\e[0q" that clears the LEDs in the original exit handler). Happy hacking.

    #include <stdio.h>
    #include <stdlib.h>
    #include <signal.h>
    #include <sys/time.h>
    #include <unistd.h>
    
    /* Index into x_offset/width of the road row the car is checked against.
       The car itself is printed on terminal line NEXT_LINE + 1. */
    enum { NEXT_LINE = 21 };

    /* Values stored in `curve`.  Each value doubles as the index of the
       matching road-edge glyph in `border`. */
    enum {
      CURVE_LEFT  = 0,
      CURVE_NONE  = 1,
      CURVE_RIGHT = 2
    };

    /* Centre column of the road, one entry per row.  Entry 0 is the row that
       is drawn at the top of the screen; update() shifts the entries towards
       higher indices once per frame.  The trailing 0 is the sentinel that
       stops the shift loop, so no live entry may ever be 0. */
    char x_offset[] = {
      40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
      40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
      0
    };

    /* Distance from the road centre to either edge, laid out exactly like
       x_offset (same row order, same 0 sentinel). */
    char width[] = {
      8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
      8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
      0
    };

    /* Road-edge glyphs, indexed by CURVE_LEFT / CURVE_NONE / CURVE_RIGHT. */
    const char border[] = "\\|/";
    
    /* Terminal-control helpers.  Each one emits the escape sequence that the
       obfuscated original printed inline; they target the Linux text console
       and are written to stdout without flushing, exactly like the original
       printf calls. */

    /* Set the pitch of the console bell: ESC [ 10 ; <frequency> ]. */
    void change_bell_frequency (int frequency)
    {
      printf ("\e[10;%u]", (unsigned) frequency);
    }

    /* Reset the terminal (ESC c); the original prints this right before the
       final score to wipe the playfield. */
    void clear_screen (void)
    {
      printf ("\ec");
    }

    /* ESC [ 0 q: clear all keyboard LEDs. */
    void clear_all_LEDs (void)
    {
      printf ("\e[0q");
    }

    /* ESC [ 2 q: set the Num Lock LED. */
    void set_Num_Lock_LED (void)
    {
      printf ("\e[2q");
    }

    /* ESC [ 1 q: set the Scroll Lock LED. */
    void set_Scroll_lock_LED (void)
    {
      printf ("\e[1q");
    }

    /* ESC [ 3 q: set the Caps Lock LED. */
    void set_Caps_Lock_LED (void)
    {
      printf ("\e[3q");
    }
    
    
    
    /* Tree sprite, listed top to bottom.  It is emitted one entry per frame
       starting from the last entry and walking towards index 0; the empty
       string at index 0 marks "no tree being drawn". */
    char *tree[] = {
      "",
      " %%%",
      "%\\|/%",
      "  |",
      "  |"
    };

    /* Entry of `tree` printed on the next frame, and the column it goes to. */
    char **tree_line = tree;
    char tree_position;

    /* Car: column on the current frame and on the previous one. */
    char pos = 40;
    char old_pos = 40;

    /* Horizontal step applied to `pos` each frame (-1, 0 or 1), set from the
       keyboard in main(). */
    char move_x = 0;

    /* Direction the newest road row bends in: one of the CURVE_* values. */
    char curve = CURVE_NONE;

    /* Speed: `gear` is the level requested by the player, `update_interval`
       the level currently in effect; update() moves the latter one step
       towards the former per frame.  Starting at -1 guarantees the timer is
       armed on the very first frame. */
    char gear = 0;
    char update_interval = -1;

    /* Non-zero when the bell should ring on every frame. */
    char play_sound = 0;

    unsigned int score = 0;

    /* Scratch variables used by the row-shifting loops in update(). */
    char *a;
    char y;
    char z;
    
    /* Place the cursor by emitting ESC [ <x> ; <y> H.
       The first field of this sequence is the screen line (compare the
       hard-coded 22/23 used when drawing the car), so despite its name `x`
       selects the line and `y` the column.  No caller is visible in this
       listing. */
    void move (char x, char y) {
      const int first_field = x;
      const int second_field = y;

      printf ("\e[%u;%uH", first_field, second_field);
    }
    
    /* Emit ESC [ L, the "insert line" sequence update() uses to open a fresh
       row at the top of the road.  No caller is visible in this listing. */
    void insert () {
      static const char insert_line[] = "\e[L";

      fputs (insert_line, stdout);
    }
    
    /* Draw one frame: the newest road row at the top of the screen, the
       current slice of the tree, and the car at its new column. */
    static void draw_frame (void)
    {
      /* draw street: go to line 1, insert a line there, print both edges */
      printf ("\e[1;%uH" "\e[L" "%c"
              "\e[1;%uH" "%c",
              (unsigned) (*x_offset - *width), border[curve],
              (unsigned) (*x_offset + *width), border[curve]);

      /* draw tree */
      printf ("\e[1;%uH" "%s",
              (unsigned) tree_position, *tree_line);

      /* redraw car: '@' at the new column, and a blank over the previous
         '@', which the inserted line has moved one line further down */
      printf ("\e[%u;%uH" "@"
              "\e[%u;%uH" " " "\n",
              (unsigned) (NEXT_LINE + 1), (unsigned) pos,
              (unsigned) (NEXT_LINE + 2), (unsigned) old_pos);
    }

    /* Move update_interval (the speed level in effect) one step towards the
       requested gear and re-arm the frame timer accordingly. */
    static void adjust_speed (void)
    {
      struct itimerval t = { { 0, 0 }, { 0, 0 } };

      if (update_interval == gear)
        return;

      update_interval += (update_interval < gear) ? 1 : -1;

      /* eight speed levels per gear; higher gears mean shorter periods */
      t.it_interval.tv_usec = t.it_value.tv_usec
        = 72000 / ((update_interval >> 3) + 1);

      /* ITIMER_REAL is the timer that delivers SIGALRM */
      setitimer (ITIMER_REAL, &t, 0);

      if (play_sound)
        change_bell_frequency (update_interval + 24);
    }

    /* Shift every entry of a 0-terminated table one slot towards the end.
       Entry 0 keeps its value, the last live entry is dropped and the
       terminating 0 is left in place. */
    static void shift_column (char *column)
    {
      char carried = *column;

      while (*++column) {
        char displaced = *column;
        *column = carried;
        carried = displaced;
      }
    }

    /* Randomly evolve the newest road row (width, tree, curve, centre).
       The order of the rand() calls is the same as in the original. */
    static void advance_road (void)
    {
      int n = rand ();

      /* narrow the road when the low eight bits are all zero, but never
         below a width of 1 */
      if (!(n & 255) && *width > 1)
        --*width;

      if (**tree_line) {
        /* a tree is in progress: step to its next slice */
        tree_line--;
      }
      else if (!(n & 7936)) {
        /* start a new tree: find a column far enough from the road */
        do
          tree_position = rand () % 76;
        while (abs (tree_position - *x_offset + 2) - *width < 6);
        ++tree_position;
        tree_line = &tree[4];
      }

      /* occasionally pick a new bend direction (0, 1 or 2) */
      n = rand () & 31;
      if (n < 3)
        curve = n;

      /* bend the road, straightening out instead of leaving the screen */
      if (curve == CURVE_LEFT) {
        --*x_offset;
        if (*x_offset <= *width) {
          ++*x_offset;
          curve = CURVE_NONE;
        }
      }
      else if (curve == CURVE_RIGHT) {
        ++*x_offset;
        if (*x_offset + *width > 79) {
          --*x_offset;
          curve = CURVE_NONE;
        }
      }
    }

    /* Advance the game by one frame.
       Called once directly from main() and afterwards as the SIGALRM handler
       driven by the interval timer.  `i` is the signal number and is not
       used.  Exits the process (running the atexit hook) when the car has
       left the road.
       NOTE(review): as in the original, printf/exit are called from signal
       context, which is not async-signal-safe. */
    void update (int i) {
      (void) i;

      pos += move_x;
      draw_frame ();

      /* did we leave the road ? */
      if (abs (pos - x_offset[NEXT_LINE]) >= width[NEXT_LINE])
        exit (0);

      /* update simulation speed */
      adjust_speed ();

      /* play sound */
      if (play_sound)
        putchar ('\a');

      /* update score: narrower road and higher gear pay more */
      score += (9 - width[NEXT_LINE]) * ((update_interval >> 3) + 1);
      old_pos = pos;

      /* scroll the road tables by one row, then generate the new top row */
      shift_column (x_offset);
      shift_column (width);
      advance_road ();

      /* re-install the handler for the next tick */
      signal (SIGALRM, update);
    }
    
    
    /* Exit hook registered with atexit() in main(): stop reacting to the
       frame timer, tidy up the terminal, show the score and restore the tty
       settings that main() changed. */
    void end () {
      static const char restore_tty[] = "stty echo -cbreak";

      signal (SIGALRM, SIG_IGN);
      clear_all_LEDs ();
      clear_screen ();
      fprintf (stdout, "Score: %u\n", score);
      system (restore_tty);
    }
    
    
    /* Return non-zero when the first four bytes of `term` equal the
       four-character `prefix`.  Stops at the first mismatch, so a shorter
       `term` is never read past its terminating NUL. */
    static int term_starts_with (const char *term, const char *prefix)
    {
      int i;

      for (i = 0; i < 4; i++)
        if (term[i] != prefix[i])
          return 0;
      return 1;
    }

    /* Entry point.
       Sound is enabled unless the first argument starts with 'q', and only
       when TERM starts with "linu" or "unil".  These are the two byte orders
       the original matched by reading TERM as an int and comparing it with
       0x756E696C and 0x6C696E75; comparing bytes keeps that behaviour without
       the unaligned type-punned read, and an unset TERM no longer crashes.
       The terminal is then put into no-echo cbreak mode, the first frame is
       drawn (which also arms the timer) and keys are processed until 'q'. */
    int main (int argc, char **argv) {
      atexit (end);

      if (argc < 2 || *argv[1] != 'q') {
        const char *term = getenv ("TERM");

        if (term != NULL
            && (term_starts_with (term, "linu")
                || term_starts_with (term, "unil")))
          play_sound = 1;
      }

      srand (getpid ());
      system ("stty -echo cbreak");

      /* start in first gear, with the same LED the '1' key selects */
      gear = 0 << 3;
      set_Num_Lock_LED ();

      update (SIGALRM);
      for (;;)
        switch (getchar ())
          {
            case 'q':
              return 0;
            case '[':
            case 'b':
            case ',':
              move_x = -1;
              continue;
            case ' ':
            case 'n':
            case '.':
              move_x = 0;
              continue;
            case ']':
            case 'm':
            case '/':
              move_x = 1;
              continue;
            case '1':
              gear = 0 << 3;
              set_Num_Lock_LED ();
              continue;
            case '2':
              gear = 1 << 3;
              set_Caps_Lock_LED ();
              continue;
            case '3':
              gear = 2 << 3;
              set_Scroll_lock_LED ();
              continue;
            case '4':
              gear = 3 << 3;
              clear_all_LEDs ();
              continue;
            default:
              /* any other key is ignored; so is EOF, which getchar() may
                 also report when a read is interrupted by the frame timer */
              continue;
          }
    }
    

    0 讨论(0)
提交回复
热议问题