Edit
For anybody with a similar problem: I found another SO answer here with a great Python solution that exploits the speed of NumPy.
Ple
Here is C++ code that gives exactly the result you want.
// http://jepsonsblog.blogspot.be/2012/10/overlay-transparent-image-in-opencv.html
// https://gist.github.com/maximus5684/082f8939edb6aed7ba0a
#include <algorithm>
#include <cstddef>
#include "iostream"
#include "opencv2/highgui.hpp"
#include "opencv2/imgproc.hpp"
using namespace cv;
using namespace std;
// Mixes one 8-bit channel value of the background (`below`) with the
// corresponding overlay value (`above`); `opacity` is the overlay coverage
// in the range [0, 1].
static unsigned char blendChannel(unsigned char below, unsigned char above, double opacity)
{
    const double mixed = below * (1.0 - opacity) + above * opacity;
    // Round to nearest instead of truncating: floating-point error must not
    // make the blend of two equal values come out one step darker.
    return static_cast<unsigned char>(mixed + 0.5);
}

// Alpha-blends `overlay` onto `src` in place.
//
// src      - image that is drawn on; modified in place.
// overlay  - image drawn on top; its 4th channel (index 3) is used as the
//            per-pixel opacity (0 = transparent, 255 = opaque).
// location - position in `src` of the overlay's top-left corner. It may be
//            negative or lie partly outside `src`; only the intersection of
//            the two images is touched.
//
// The function returns without touching `src` when either pointer is null,
// either image is empty, either image does not use one byte per channel, or
// the overlay has no 4th channel to read the opacity from.
//
// Only the first three channels of `src` are blended with the overlay's
// colours. If `src` has a 4th channel it is treated as alpha and updated with
// the "source over" rule, so an opaque background stays opaque.
void overlayImage(Mat* src, Mat* overlay, const Point& location)
{
    if (!src || !overlay || src->empty() || overlay->empty())
        return;

    // The pixel access below indexes raw bytes, which is only meaningful
    // when every channel occupies exactly one byte.
    if (src->elemSize1() != 1 || overlay->elemSize1() != 1)
        return;

    const int alphaIndex = 3;
    const int srcChannels = src->channels();
    const int overlayChannels = overlay->channels();
    if (overlayChannels <= alphaIndex)
        return; // no alpha channel: reading index 3 would hit a neighbouring pixel

    const int colourChannels = std::min(srcChannels, alphaIndex);
    const bool srcHasAlpha = srcChannels > alphaIndex;

    // Intersection of the overlay rectangle with `src`, in `src` coordinates.
    // The far edges are computed in a wider type so that an extreme
    // `location` cannot overflow.
    const int yBegin = std::max(location.y, 0);
    const int xBegin = std::max(location.x, 0);
    const long long yLimit = static_cast<long long>(location.y) + overlay->rows;
    const long long xLimit = static_cast<long long>(location.x) + overlay->cols;
    const int yEnd = static_cast<int>(std::min<long long>(src->rows, yLimit));
    const int xEnd = static_cast<int>(std::min<long long>(src->cols, xLimit));

    const std::size_t srcStep = src->step;
    const std::size_t overlayStep = overlay->step;

    for (int y = yBegin; y < yEnd; ++y)
    {
        unsigned char* srcRow = src->data + y * srcStep;
        const unsigned char* overlayRow = overlay->data + (y - location.y) * overlayStep;

        for (int x = xBegin; x < xEnd; ++x)
        {
            unsigned char* srcPx = srcRow + x * srcChannels;
            const unsigned char* overlayPx = overlayRow + (x - location.x) * overlayChannels;

            const double opacity = overlayPx[alphaIndex] / 255.0;
            if (opacity <= 0.0)
                continue; // fully transparent overlay pixel leaves src untouched

            for (int c = 0; c < colourChannels; ++c)
                srcPx[c] = blendChannel(srcPx[c], overlayPx[c], opacity);

            // Resulting coverage is overlayA + srcA * (1 - overlayA), i.e. the
            // background alpha blended towards fully opaque (255).
            if (srcHasAlpha)
                srcPx[alphaIndex] = blendChannel(srcPx[alphaIndex], 255, opacity);
        }
    }
}
int main( int argc, char** argv )
{
Mat overlay = imread("ZuWDz.png",IMREAD_UNCHANGED);
Mat underlay = imread("CtBAe.png",IMREAD_UNCHANGED);
if( underlay.empty() || overlay.empty() )
{
return -1;
}
overlayImage( &underlay, &overlay, Point() );
imshow("underlay result",underlay);
waitKey();
return 0;
}