Performance hit of vtable lookup in C++

前端 未结 6 1163
日久生厌
日久生厌 2020-12-24 03:52

I'm evaluating whether to rewrite a piece of real-time software from C/assembly language to C++/assembly language (for reasons not relevant to the question parts of the code are ab

相关标签:
6条回答
  • 2020-12-24 04:04

    What makes you think vtable lookup overhead is 20 cycles? If that's really true, you need a better C++ compiler.

    I tried this on an Intel box, not knowing anything about the processor you're using, and as expected the difference between the C despatch code and the C++ vtable despatch is one instruction, having to do with the fact that the vtable involves an extra indirect.

    C code (based on OP):

    /* Dispatch table from the OP's C design: 200 task function pointers,
       each paired with one opaque parameter. */
    void (*todolist[200])(void *parameters);
    void *paramlist[200];
    /* Run every queued task once, in order.  With gcc -O3 this produces
       the left-hand assembly column shown below: a single indirect call
       per iteration, no extra indirection. */
    void realtime(void)
    {
      int i;
      for (i = 0; i < 200; i++)
        (*todolist[i])(paramlist[i]);
    }
    

    C++ code:

    // C++ analogue of the C dispatch table: each task becomes an object,
    // whose virtual operator() replaces the {function pointer, parameter}
    // pair of the C version.
    class Base {
      public:
        Base(void* unsafe_pointer) : unsafe_pointer_(unsafe_pointer) {}
        // Task entry point; overridden by each concrete task type.
        virtual void operator()() = 0;
        // NOTE(review): no virtual destructor — acceptable only because
        // these example objects are never deleted through Base*.
      protected:
        void* unsafe_pointer_;  // carried task parameter (was paramlist[i])
    };
    
    Base* todolist[200];  // one pointer per task object
    void realtime() {
      for (int i = 0; i < 200; ++i)
        (*todolist[i])();  // virtual dispatch: load vptr, call through vtable
    }
    

    Both compiled with gcc 4.8, -O3:

    realtime:                             |_Z8realtimev:
    .LFB0:                                |.LFB3:
            .cfi_startproc                |        .cfi_startproc
            pushq   %rbx                  |        pushq   %rbx
            .cfi_def_cfa_offset 16        |        .cfi_def_cfa_offset 16
            .cfi_offset 3, -16            |        .cfi_offset 3, -16
            xorl    %ebx, %ebx            |        movl    $todolist, %ebx
            .p2align 4,,10                |        .p2align 4,,10
            .p2align 3                    |        .p2align 3
    .L3:                                  |.L3:
            movq    paramlist(%rbx), %rdi |        movq    (%rbx), %rdi
            call    *todolist(%rbx)       |        addq    $8, %rbx
            addq    $8, %rbx              |        movq    (%rdi), %rax
                                          |        call    *(%rax)
            cmpq    $1600, %rbx           |        cmpq    $todolist+1600, %rbx
            jne     .L3                   |        jne     .L3
            popq    %rbx                  |        popq    %rbx
            .cfi_def_cfa_offset 8         |        .cfi_def_cfa_offset 8
            ret                           |        ret
    

    In the C++ code, the first movq gets the address of the vtable, and the call then indexes through that. So that's one instruction overhead.

    According to OP, the DSP's C++ compiler produces the following code. I've inserted comments based on my understanding of what's going on (which might be wrong). Note that (IMO) the loop starts one location earlier than OP indicates; otherwise, it makes no sense (to me).

    # Initialization.
    # i3=todolist; i5=paramlist           | # i5=todolist holds paramlist
    i3=0xb27ba;                           | # No paramlist in C++
    i5=0xb28e6;                           | i5=0xb279a;
    # r15=count
    r15=0xc8;                             | r15=0xc8;
    
    # Loop. We need to set up r4 (first parameter) and figure out the branch address.
    # In C++ by convention, the first parameter is 'this'
    # Note 1:
    r4=dm(i5,m6); # r4 = *paramlist++;    | i5=modify(i5,m6); # i4 = *todolist++
                                          | i4=dm(m7,i5);     # ..
    # Note 2:                            
                                          | r2=i4;            # r2 = obj
                                          | i4=dm(m6,i4);     # vtable = *(obj + 1)
                                          | r1=dm(0x3,i4);    # r1 = vtable[3]
                                          | r4=r2+r1;         # param = obj + r1
    
    i12=dm(i3,m6); # i12 = *todolist++;   | i12=dm(0x5,i4);   # i12 = vtable[5]
    
    # Boilerplate call. Set frame pointer, push return address and old frame pointer.
    # The two (push) instructions after jump are actually executed before the jump.
    r2=i6;                                | r2=i6;
    i6=i7;                                | i6=i7;
    jump (m13,i12) (db);                  | jump (m13,i12) (db);
    dm(i7,m7)=r2;                         | dm(i7,m7)=r2;
    dm(i7,m7)=0x1279de;                   | dm(i7,m7)=0x1279e2;
    
    # if (count--) loop
    r15=r15-1;                            | r15=r15-1;
    if ne jump (pc, 0xfffffff2);          | if ne jump (pc, 0xffffffe7);
    

    Notes:

    1. In the C++ version, it seems that the compiler has decided to do the post-increment in two steps, presumably because it wants the result in an i register rather than in r4. This is undoubtedly related to the issue below.

    2. The compiler has decided to compute the base address of the object's real class, using the object's vtable. This occupies three instructions, and presumably also requires the use of i4 as a temporary in step 1. The vtable lookup itself occupies one instruction.

    So: the issue is not vtable lookup, which could have been done in a single extra instruction (but actually requires two). The problem is that the compiler feels the need to "find" the object. But why doesn't gcc/i86 need to do that?

    The answer is: it used to, but it doesn't any more. In many cases (where there is no multiple inheritance, for example), the cast of a pointer to a derived class to a pointer of a base class does not require modifying the pointer. Consequently, when we call a method of the derived class, we can just give it the base class pointer as its this parameter. But in other cases, that doesn't work, and we have to adjust the pointer when we do the cast, and consequently adjust it back when we do the call.

    There are (at least) two ways to perform the second adjustment. One is the way shown by the generated DSP code, where the adjustment is stored in the vtable -- even if it is 0 -- and then applied during the call. The other way, (called vtable-thunks) is to create a thunk -- a little bit of executable code -- which adjusts the this pointer and then jumps to the method's entry point, and put a pointer to this thunk into the vtable. (This can all be done at compile time.) The advantage of the thunk solution is that in the common case where no adjustment needs to be done, we can optimize away the thunk and there is no adjustment code left. (The disadvantage is that if we do need an adjustment, we've generated an extra branch.)

    As I understand it, VisualDSP++ is based on gcc, and it might have the -fvtable-thunks and -fno-vtable-thunks options. So you might be able to compile with -fvtable-thunks. But if you do that, you would need to compile all the C++ libraries you use with that option, because you cannot mix the two calling styles. Also, there were (15 years ago) various bugs in gcc's vtable-thunks implementation, so if the version of gcc used by VisualDSP++ is old enough, you might run into those problems too (IIRC, they all involved multiple inheritance, so they might not apply to your use case.)


    (Original test, before update):

    I tried the following simple case (no multiple inheritance, which can slow things down):

    // Simple single-inheritance hierarchy used to measure virtual-call
    // codegen: three pure virtual methods of differing arity.
    class Base {
      public:
        Base(int val) : val_(val) {}
        virtual int binary(int a, int b) = 0;
        virtual int unary(int a) = 0;
        virtual int nullary() = 0;
      protected:
        int val_;
    };
    
    // Each driver below walks a contiguous array of objects and sums the
    // results of one virtual call per element; the unary loop's -O3
    // assembly is shown after this listing.
    int binary(Base* begin, Base* end, int a, int b) {
      int accum = 0;
      for (; begin != end; ++begin) { accum += begin->binary(a, b); }
      return accum;
    }
    
    int unary(Base* begin, Base* end, int a) {
      int accum = 0;
      for (; begin != end; ++begin) { accum += begin->unary(a); }
      return accum;
    }
    
    int nullary(Base* begin, Base* end) {
      int accum = 0;
      for (; begin != end; ++begin) { accum += begin->nullary(); }
      return accum;
    }
    

    And compiled it with gcc (4.8) using -O3. As I expected, it produced exactly the same assembly code as your C despatch would have done. Here's the for loop in the case of the unary function, for example:

    .L9:
            movq    (%rbx), %rax
            movq    %rbx, %rdi
            addq    $16, %rbx
            movl    %r13d, %esi
            call    *8(%rax)
            addl    %eax, %ebp
            cmpq    %rbx, %r12
            jne     .L9
    
    0 讨论(0)
  • 2020-12-24 04:05

    As has already been mentioned, you can use templates to do away with the dynamic dispatch. Here is an example that does this:

    // Compile-time handler chain: invokes FirstCb, then recurses on the
    // remaining parameter pack.  Handlers are default-constructed here;
    // adapt the signatures if yours need constructor parameters.
    template <typename FirstCb, typename ... RestCb>
    struct InterruptHandler {
        void execute() {
            FirstCb().execute();
            InterruptHandler<RestCb...>().execute();
        }
    };  // the original listing was missing this semicolon

    // Base case: terminate the recursion when one handler remains.
    // Without this specialization, the recursive instantiation ends with
    // InterruptHandler<>, which cannot match the primary template, and
    // the code does not compile.
    template <typename LastCb>
    struct InterruptHandler<LastCb> {
        void execute() {
            LastCb().execute();
        }
    };

    // Example usage (list your concrete handler types):
    //   InterruptHandler<Handler1, Handler2 /*, ... */> handler;
    //
    //   void realtime(void) {
    //       handler.execute();
    //   }
    

    This should completely eliminate the vtable lookups while providing more opportunities for compiler optimization since the code inside execute can be inlined.

    Note however that you will need to change some parts depending on how you initialize your handlers. The basic framework should remain the same. Also, this requires that you have a C++11 compliant compiler.

    0 讨论(0)
  • 2020-12-24 04:26

    I suggest using static methods in your derived classes and placing these functions into your array. This would eliminate the overhead of the v-table search. This is closest to your C language implementation.

    You may end up sacrificing the polymorphism for speed.
    Is the inheritance necessary?
    Just because you switch to C++ doesn't mean you have to switch to Object Oriented.

    Also, have you tried unrolling your loop in the ISR?
    For example, perform 2 or more execution calls before returning back to the top of the loop.

    Also, can you move any of the functionality out of the ISR? Can any part of the functionality be performed by the "background loop" instead of the ISR? This would reduce the time in your ISR.

    0 讨论(0)
  • 2020-12-24 04:26

    You can hide the void* type erasure and type recovery inside templates. The result would (hopefully) be the same array of function pointers. This would help with casting and stay compatible with your code:

    #include <iostream>
    
    // Recovers the erased parameter type inside the template, so the
    // instantiated function keeps the plain C signature void(void*).
    template<class ParamType,class F>
    void fun(void* param) {
      ParamType& typed = *static_cast<ParamType*>(param);
      F{}(typed);
    }
    
    // A function object with a fully typed call operator.
    struct my_function {
      void operator()(int& i) {
        std::cout << "got it " << i << std::endl;
      }
    };
    
    
    int main() {
      // The instantiation yields an ordinary function pointer, directly
      // usable in a C-style dispatch array.
      void (*func)(void*) = &fun<int, my_function>;
    
      int j=4;
      func(&j);
    
      return 0;
    }
    

    In this case you can create new functions as function objects with more type safety. The "normal" OOP approach with virtual functions doesn't help here.

    In case of a C++11 environment you could create the array with the help of variadic templates at compile time (but with a complicated syntax).

    0 讨论(0)
  • 2020-12-24 04:26

    This is unrelated to your question, but if you are that keen on performance you could use templates to do a loop unroll for the todolist:

    // Fixed-size dispatch table: three C-style task slots.
    void (*todo[3])(void *);
    void *param[3];
    
    void f1(void*) {std::cout<<"1" << std::endl;}
    void f2(void*) {std::cout<<"2" << std::endl;}
    void f3(void*) {std::cout<<"3" << std::endl;}
    
    // Compile-time loop unroll: Obj<N>::apply() expands into N direct
    // calls with constant indices.  Note the entries run in reverse
    // order (todo[N-1] first), as in the original recursion.
    template<int N>
    struct Obj {
        static void apply()
        {
            todo[N-1](param[N-1]);
            Obj<N-1>::apply();
        }
    };
    
    // Recursion terminator for the empty table.
    template<> struct Obj<0> { static void apply() {} };
    
    // The original listing placed these statements at namespace scope,
    // which is not valid C++; they must live inside a function.
    void realtime(void)
    {
        todo[0] = f1;
        todo[1] = f2;
        todo[2] = f3;
    
        Obj<sizeof todo / sizeof *todo>::apply();
    }
    
    0 讨论(0)
  • 2020-12-24 04:29

    Find out where your compiler puts the vtable and access it directly to get the function pointers and store them for usage. That way you will have pretty much the same approach like in C with an array of function pointers.

    0 讨论(0)
提交回复
热议问题