How to implement the Softmax derivative independently from any loss function?

前端 未结 4 1652
夕颜
夕颜 2021-02-05 16:04

For a neural networks library I implemented some activation functions and loss functions and their derivatives. They can be combined arbitrarily and the derivative at the output

相关标签:
4条回答
  • 2021-02-05 16:33

    It should be like this (x is the input to the softmax layer, y = softmax(x) is the layer's output, and dy is the delta coming from the loss above it):

        # Body of the softmax layer's delta (backward) function.
        # dy is the delta coming from the loss above; y is presumably the softmax
        # output for the input x (it is not defined in this fragment -- confirm).
        # Element-wise product of the output with the incoming delta.
        dx = y * dy
        # Sum that product over the last axis; keepdims=True keeps the reduced axis
        # so the result broadcasts against y on the next line.
        s = dx.sum(axis=dx.ndim - 1, keepdims=True)
        # Subtract y times that sum, in place.
        dx -= y * s
    
        return dx
    

    But the way you compute the error should be:

        # Forward pass: run the activation on the layer input x.
        yact = activation.compute(x)
        # Evaluate the cost on the activation output.
        ycost = cost.compute(yact)
        # Backward pass: the cost's delta is handed to the activation's delta
        # as its second argument, alongside the original input x.
        dsoftmax = activation.delta(x, cost.delta(yact, ycost, ytrue)) 
    

    Explanation: Because the delta function is a part of the backpropagation algorithm, its responsibility is to multiply the vector dy (in my code; "outgoing" in your case) by the Jacobian of the compute(x) function evaluated at x. If you work out what this Jacobian looks like for softmax [1], and then multiply it from the left by a vector dy, after a bit of algebra you'll find that you get something that corresponds to my Python code.

    [1] https://stats.stackexchange.com/questions/79454/softmax-layer-in-a-neural-network

    0 讨论(0)
  • 2021-02-05 16:44

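    Mathematically, the derivative of the softmax output $\sigma_j$ with respect to the logit $z_i$ (for example, $z_i = W_i \cdot X$) is

    $$\frac{\partial \sigma_j}{\partial z_i} = \sigma_j \, (\delta_{ij} - \sigma_i)$$

    where $\delta_{ij}$ (the red delta) is the Kronecker delta: 1 when $i = j$ and 0 otherwise.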

    If you implement iteratively:

    def softmax_grad(s):
        """Build the softmax Jacobian element by element from the softmax output.

        Args:
            s: softmax values of the original input x, as a 1-D array of
               length n or a (1, n) row vector,
               e.g. s = np.array([0.3, 0.7]) for some input x.

        Returns:
            An (n, n) array J with J[i][j] = s[i] * (1 - s[i]) when i == j
            and J[i][j] = -s[i] * s[j] otherwise.
        """
        # Flatten first: np.diag builds a diagonal matrix only from 1-D input.
        # Given a 2-D (1, n) array it would extract the existing diagonal instead,
        # and the loops below would then index a 1-element vector.
        s = np.asarray(s).reshape(-1)

        # make the matrix whose size is n^2.
        jacobian_m = np.diag(s)

        for i in range(len(jacobian_m)):
            for j in range(len(jacobian_m)):
                if i == j:
                    jacobian_m[i][j] = s[i] * (1 - s[i])
                else:
                    jacobian_m[i][j] = -s[i] * s[j]
        return jacobian_m
    

    Test:

    In [95]: x
    Out[95]: array([1, 2])
    
    In [96]: softmax(x)
    Out[96]: array([ 0.26894142,  0.73105858])
    
    In [97]: softmax_grad(softmax(x))
    Out[97]: 
    array([[ 0.19661193, -0.19661193],
           [-0.19661193,  0.19661193]])
    

    If you implement in a vectorized version:

    soft_max = softmax(x)    
    
    # reshape softmax to 2d so np.dot gives matrix multiplication
    
    def softmax_grad(softmax):
        """Vectorized softmax Jacobian: diag(s) minus the outer product of s with itself.

        `softmax` is the softmax output; it is reshaped to a column so that
        the matrix product below yields an n-by-n outer product.
        """
        column = softmax.reshape(-1, 1)
        outer_product = np.dot(column, column.T)
        diagonal = np.diagflat(column)
        return diagonal - outer_product
    
    softmax_grad(soft_max)
    
    #array([[ 0.19661193, -0.19661193],
    #       [-0.19661193,  0.19661193]])
    
    0 讨论(0)
  • 2021-02-05 16:49

    Just in case you are processing in batches, here is an implementation in NumPy (tested against TensorFlow). However, I would suggest avoiding the associated tensor operations altogether by combining the Jacobian with the cross-entropy loss, which leads to a very simple and efficient expression.

    def softmax(z):
      """Row-wise softmax of a 2-D array of logits.

      Args:
        z: array of shape (batch, n); each row is normalized independently.

      Returns:
        Array of the same shape whose rows are non-negative and sum to 1.
      """
      # Shift every row by its OWN maximum. Softmax is invariant to a per-row
      # constant shift, and this makes the largest exponent in each row exactly 0.
      # Shifting by the global maximum instead lets a row that lies far below it
      # underflow to all zeros, which then produces 0/0 = NaN.
      shifted = z - np.max(z, axis=1, keepdims=True)
      exps = np.exp(shifted)
      return exps / np.sum(exps, axis=1, keepdims=True)
    
    def softmax_jacob(s):
      """Batched softmax Jacobian from a batch of softmax outputs.

      For s indexed as [i, j], the result is indexed as [i, j, k] and equals
      s[i, j] * eye[j, k] - s[i, j] * s[i, k].
      """
      identity = np.eye(s.shape[-1])
      # s[i, j] placed on the diagonal of each per-sample matrix.
      diagonal_term = np.einsum('ij,jk->ijk', s, identity)
      # Per-sample outer product of s with itself.
      outer_term = np.einsum('ij,ik->ijk', s, s)
      return diagonal_term - outer_term
    
    def np_softmax_test(z):
      """NumPy side of the comparison: apply softmax to z, then take its Jacobian."""
      probabilities = softmax(z)
      return softmax_jacob(probabilities)
    
    def tf_softmax_test(z):
      """TensorFlow side of the comparison: batch Jacobian of tf.nn.softmax at z.

      z is converted to a float32 constant tensor, softmax is recorded under a
      GradientTape, and the per-sample Jacobian is returned as a NumPy array.
      """
      logits = tf.constant(z, dtype=tf.float32)
      with tf.GradientTape() as tape:
        # Explicitly watch the tensor so the tape records operations on it.
        tape.watch(logits)
        probabilities = tf.nn.softmax(logits)
      batch_jacobian = tape.batch_jacobian(probabilities, logits)
      return batch_jacobian.numpy()
    
    # Random input of shape (3, 5) drawn from a standard normal distribution.
    z = np.random.randn(3, 5)
    # True when every element of the NumPy Jacobian is close to the TensorFlow one.
    # The result is not stored or asserted; it is only visible in an interactive session.
    np.all(np.isclose(np_softmax_test(z), tf_softmax_test(z)))
    
    0 讨论(0)
  • 2021-02-05 16:54

    Here is a C++ vectorized version, using AVX intrinsics (22 times (!) faster than the non-vectorized version):

    // How many floats fit into one __m256 "group" (256 bits / 32 bits per float).
    // Used by vectors and matrices, to ensure their dimensions are appropriate for
    // intrinsics.
    // Otherwise, consecutive rows of matrices will not be 32-byte aligned, and
    // aligned __m256 operations on them will be incorrect.
    #define F_MULTIPLE_OF_M256 8


    // Check to quickly see if your rows are divisible by the __m256 group size.
    // Remove the ASSERT_THE_M256_MULTIPLES define to compile the check away
    // after everything was verified to be correct.
    #define ASSERT_THE_M256_MULTIPLES
    #ifdef ASSERT_THE_M256_MULTIPLES
        // The argument is parenthesized so expressions such as (a + b) are checked as a whole.
        #define assert_is_m256_multiple(x)  assert( ((x) % F_MULTIPLE_OF_M256) == 0 )
    #else
        // Function-like no-op: there must be no space between the name and '(',
        // otherwise this becomes an object-like macro and call sites fail to compile.
        #define assert_is_m256_multiple(x)  ((void)0)
    #endif
    
    
    // Horizontal add: collapses the F_MULTIPLE_OF_M256 float lanes of x into one scalar.
    // Typically called at the end of Reduce-style functions, once the running
    // __m256 sum has to become a single float. Lanes are added one by one,
    // from the lowest address upwards.
    static inline float slow_hAdd_ps(__m256 x){
        const float *lane = reinterpret_cast<const float*>(&x);
        const float *const lanesEnd = lane + F_MULTIPLE_OF_M256;

        float total = 0.0f;
        while(lane != lanesEnd){
            total += *lane;
            ++lane;
        }
        return total;
    }
    
    
    
    // Backward pass of softmax, computed from the forward result.
    //
    // For every element i this produces
    //     grad[i] = softmaxResult[i] * ( gradFromAbove[i] - dot ),
    //     dot     = sum over j of softmaxResult[j] * gradFromAbove[j],
    // which is the softmax Jacobian ( s_i * (delta_ij - s_j) ) applied to gradFromAbove.
    //
    // softmaxResult : softmax outputs, `size` floats.
    // size          : element count; asserted to be a multiple of the __m256 group size.
    // gradFromAbove : gradient flowing into us from the above layer, `size` floats.
    // Returns a new f_vec holding the `size` gradient values.
    //
    // Unaligned loads/stores are used, so the buffers are not required to be 32-byte aligned.
    f_vec SoftmaxGrad_fromResult(const float *softmaxResult,  size_t size,
                                 const float *gradFromAbove){//<--gradient vector, flowing into us from the above layer
        assert_is_m256_multiple(size);

        // Floats per __m256 register.
        constexpr size_t lanes = sizeof(__m256) / sizeof(float);

        //allocate vector, where to store output:
        f_vec grad_v(size, true);//true: skip filling with zeros, to save performance.

        // The weighted sum is identical for every row, so compute it once
        // instead of once per output element.
        __m256 dotLanes = _mm256_setzero_ps();
        for(size_t j=0; j<size; j+=lanes){
            const __m256 s      = _mm256_loadu_ps(softmaxResult + j);
            const __m256 gAbove = _mm256_loadu_ps(gradFromAbove + j);
            dotLanes = _mm256_add_ps( dotLanes, _mm256_mul_ps(s, gAbove) );
        }
        //collapse the lanes into 1 scalar and broadcast it back to all lanes:
        const __m256 dot = _mm256_set1_ps( slow_hAdd_ps(dotLanes) );

        // Kronecker-delta term and the shared term combined: s * (gAbove - dot).
        // Both inputs advance together, one group per iteration.
        float *out = reinterpret_cast<float*>(grad_v._contents);
        for(size_t j=0; j<size; j+=lanes){
            const __m256 s      = _mm256_loadu_ps(softmaxResult + j);
            const __m256 gAbove = _mm256_loadu_ps(gradFromAbove + j);
            _mm256_storeu_ps( out + j, _mm256_mul_ps( s, _mm256_sub_ps(gAbove, dot) ) );
        }

        return grad_v;
    }
    

    If for some reason somebody wants a simple (non-SSE) version, here it is:

    // Backward pass of softmax, scalar (non-intrinsics) version.
    //
    // Every pre-softmax element contributed to the softmax of every other element
    // (it went into the denominator), so the gradient of every post-softmax element
    // is distributed to every pre-softmax element:
    //     gradOutput[i] = softmaxResult[i] * ( gradFromAbove[i] - dot ),
    //     dot           = sum over j of gradFromAbove[j] * softmaxResult[j].
    //
    // softmaxResult : softmax outputs, `count` floats.
    // gradFromAbove : gradient flowing into us from the above layer, `count` floats.
    // gradOutput    : receives `count` floats; old values are overwritten.
    //                 It may be the same buffer as gradFromAbove, because each
    //                 element is read before it is written.
    // count         : number of elements; 0 is a no-op.
    inline static void SoftmaxGrad_fromResult_nonSSE(const float* softmaxResult,
                                                     const float *gradFromAbove,  //<--gradient vector, flowing into us from the above layer
                                                     float *gradOutput,
                                                     size_t count ){
        // The weighted sum is the same for every row, so it is computed once
        // (linear time) rather than once per output element.
        float dot =  0.0f;
        for(size_t j=0; j<count; ++j){
            dot +=  gradFromAbove[j] * softmaxResult[j];
        }

        //NOTICE: equals, overwriting any old values:
        for(size_t i=0; i<count; ++i){
            gradOutput[i]  =  softmaxResult[i] * (gradFromAbove[i] - dot);
        }
    }
    
    0 讨论(0)
提交回复
热议问题