本blog为github上CharlesShang/TFFRCNN版源码解析系列代码笔记
---------------个人学习笔记---------------
----------------本文作者吴疆--------------
------点击此处链接至博客园原文------
""" Outputs object detection proposals by applying estimated bounding-box transformations to a set of regular boxes (called "anchors"). 根据RPN目标回归值修正anchors并做后处理输出proposals和全0batch_ind组成的blob """
1.proposal_layer(rpn_cls_prob_reshape, rpn_bbox_pred, im_info, cfg_key, _feat_stride = [16,], anchor_scales = [8, 16, 32]) 算法逻辑
调用generate_anchors(...)(generate_anchors.py中)产生9个base anchors--->
im_info = im_info[0] # 取出第一张图像更新im_info,存储该图像的宽、高和缩放因子--->
仅取出rpn_cls_prob_reshape层输出anchors属于fg的score--->
计算shifts偏移量,即conv5_3 feature map各个位置相对于(0,0)位置(在scaled图像上)的偏移,如[0,16,0,16]。之所以用4列而不是2列表示偏移,是因为anchor以(x1,y1,x2,y2)表示,左上角和右下角都要加上相同的(dx,dy),写成[dx,dy,dx,dy]后即可与anchors直接广播相加--->
在conv5_3 feature map各个位置利用shifts和9个base anchors产生所有anchors,计算anchors需对base anchors和shifts进行reshape,此处要用到Python的broadcast机制--->
调用bbox_transform_inv(...)(bbox_transform.py中)对所有anchors+预测得到的回归值得到proposals--->
调用clip_boxes(...)函数(bbox_transform.py中)将越界proposals限制在图像边界(原文说训练阶段,剔除越界的box;测试阶段,限制在图像边界,实际上代码表明均是限制在图像边界)--->
调用_filter_boxes(...)函数剔除尺寸小于min_size的proposals--->
按score从大到小对proposal进行排序,取前pre_nms_topN个proposals(训练12000,测试6000)--->
调用nms(...)(nms_wrapper.py中)进行nms处理,并取post_nms_topN个proposals(训练2000,测试300)--->
将proposals组成blob并返回,形状为(N,5)(测试阶段N最多为300),每行为[batch_ind(全0),x1,y1,x2,y2]
def proposal_layer(rpn_cls_prob_reshape, rpn_bbox_pred, im_info, cfg_key,
                   _feat_stride=[16, ], anchor_scales=[8, 16, 32]):
    """Turn RPN outputs into a blob of region proposals.

    Applies the predicted bounding-box regression deltas to a regular grid
    of anchors, post-processes the resulting boxes (clip, size filter,
    top-N by score, NMS) and returns them prefixed with a batch index.

    Parameters
    ----------
    rpn_cls_prob_reshape : array, (1, H, W, A*2)
        RPN bg/fg probabilities; the last axis is read as (A, 2) with the
        fg probability at index 1.
        NOTE: older checkpoints used the (1, H, W, 2, A) ordering, see the
        note next to the score extraction below.
    rpn_bbox_pred : array, (1, H, W, A*4)
        RPN box regression output.
    im_info : sequence of [image_height, image_width, scale_ratio] rows
        Only the first row is used (single-image batches).
    cfg_key : str
        'TRAIN' or 'TEST'; selects the RPN_* settings read from ``cfg``.
    _feat_stride : list
        Down-sampling ratio from the input image to the feature map.
        The list defaults are kept for interface compatibility; they are
        only read here, never mutated.
    anchor_scales : list
        Scales passed to ``generate_anchors``.

    Returns
    -------
    blob : float32 array, (N, 5)
        One row ``[0, x1, y1, x2, y2]`` per proposal kept after NMS.
        N is at most RPN_POST_NMS_TOP_N when that setting is positive.

    Algorithm
    ---------
    for each (H, W) location i:
        generate A anchor boxes at cell i
        apply the predicted bbox deltas at cell i to each of the A anchors
    clip predicted boxes to the image (done for both TRAIN and TEST)
    remove predicted boxes with either height or width < threshold
    sort all (proposal, score) pairs by score from highest to lowest
    take the top pre_nms_topN proposals before NMS
    apply NMS to the remaining proposals
    take the top post_nms_topN proposals after NMS
    """
    # Base anchors for the first feature-map cell; their count A comes
    # from generate_anchors (9 per the accompanying notes).
    _anchors = generate_anchors(scales=np.array(anchor_scales))
    _num_anchors = _anchors.shape[0]

    # Only the first image's [height, width, scale] row is used.
    im_info = im_info[0]

    assert rpn_cls_prob_reshape.shape[0] == 1, \
        'Only single item batches are supported'

    # Per-phase settings; the original notes list them as TRAIN/TEST =
    # 12000/6000 (pre-NMS), 2000/300 (post-NMS), 0.7 (NMS) and 16 (min size).
    pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
    post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
    nms_thresh = cfg[cfg_key].RPN_NMS_THRESH
    min_size = cfg[cfg_key].RPN_MIN_SIZE

    # Spatial size of the feature map.
    height, width = rpn_cls_prob_reshape.shape[1:3]

    # (1, H, W, A*2) -> (1, H, W, A, 2) -> keep fg prob -> (1, H, W, A).
    # NOTE: the old version is ordered by (1, H, W, 2, A). To use the old
    # trained model VGGnet_fast_rcnn_iter_70000.ckpt, replace this with
    #     scores = rpn_cls_prob_reshape[:, :, :, _num_anchors:]
    scores = np.reshape(rpn_cls_prob_reshape,
                        [1, height, width, _num_anchors, 2])[:, :, :, :, 1]
    bbox_deltas = rpn_bbox_pred

    if DEBUG:
        # Single-argument call form: valid on both Python 2 and Python 3.
        print('im_size: ({}, {})'.format(im_info[0], im_info[1]))
        print('scale: {}'.format(im_info[2]))
        print('score map size: {}'.format(scores.shape))

    # 1. Generate proposals from bbox deltas and shifted anchors.
    # Offset of every feature-map cell relative to cell (0, 0), measured
    # on the scaled input image.
    shift_x = np.arange(0, width) * _feat_stride
    shift_y = np.arange(0, height) * _feat_stride
    shift_x, shift_y = np.meshgrid(shift_x, shift_y)
    # One row (dx, dy, dx, dy) per cell so the same offset is added to
    # both corners of an (x1, y1, x2, y2) anchor; shape (H*W, 4).
    shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                        shift_x.ravel(), shift_y.ravel())).transpose()

    # Enumerate all shifted anchors via broadcasting:
    #   anchors (1, A, 4) + shifts (K, 1, 4) -> (K, A, 4) -> (K*A, 4)
    A = _num_anchors
    K = shifts.shape[0]
    anchors = _anchors.reshape((1, A, 4)) + \
        shifts.reshape((1, K, 4)).transpose((1, 0, 2))
    anchors = anchors.reshape((K * A, 4))

    # Flatten deltas and scores so their rows line up with the anchors,
    # i.e. ordered by (h, w, a) from slowest to fastest.
    bbox_deltas = bbox_deltas.reshape((-1, 4))
    scores = scores.reshape((-1, 1))

    # Convert anchors into proposals via the bbox transformations.
    proposals = bbox_transform_inv(anchors, bbox_deltas)

    # 2. Clip predicted boxes to the image.
    proposals = clip_boxes(proposals, im_info[:2])

    # 3. Remove predicted boxes with either height or width < threshold.
    # min_size is converted to the input image scale stored in im_info[2].
    keep = _filter_boxes(proposals, min_size * im_info[2])
    proposals = proposals[keep, :]
    scores = scores[keep]

    # 4. Sort all (proposal, score) pairs by score from highest to lowest.
    # 5. Take the top pre_nms_topN. argsort() is ascending, hence [::-1].
    order = scores.ravel().argsort()[::-1]
    if pre_nms_topN > 0:
        order = order[:pre_nms_topN]
    proposals = proposals[order, :]
    scores = scores[order]

    # 6. Apply NMS on dets = [x1, y1, x2, y2, score].
    # 7. Take the top post_nms_topN.
    keep = nms(np.hstack((proposals, scores)), nms_thresh)
    if post_nms_topN > 0:
        keep = keep[:post_nms_topN]
    proposals = proposals[keep, :]
    scores = scores[keep]

    # 8. Output rois blob. Only a single input image is supported, so all
    # batch indices are 0.
    batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32)
    blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False)))
    return blob
# -*- coding:utf-8 -*-
# Author: WUJiang
# Demo script: how np.meshgrid() and np.ravel() behave.

import numpy as np

x_values = np.arange(0, 4)   # [0 1 2 3]
y_values = np.arange(1, 5)   # [1 2 3 4]

# np.meshgrid(arg1, arg2):
#   - the first result repeats arg1 as every row, with len(arg2) rows;
#   - the second result repeats arg2 as every column, with len(arg1) columns.
shift_x, shift_y = np.meshgrid(x_values, y_values)

# Expected output:
# [[0 1 2 3]
#  [0 1 2 3]
#  [0 1 2 3]
#  [0 1 2 3]]
print(shift_x)

# ravel() flattens a multi-dimensional array to one dimension.
# Expected output: [0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3]
print(shift_x.ravel())

# Expected output:
# [[1 1 1 1]
#  [2 2 2 2]
#  [3 3 3 3]
#  [4 4 4 4]]
print(shift_y)

# Expected output: [1 1 1 1 2 2 2 2 3 3 3 3 4 4 4 4]
print(shift_y.ravel())
# -*- coding:utf-8 -*-
# Author: WUJiang
# Demo script: NumPy broadcasting, mirroring the anchors + shifts addition.

import numpy as np

a = np.array([
    [1, 2, 3, 4],
    [2, 5, 7, 6],
])
b = np.array([
    [5, 2, 3, 4],
    [2, 7, 7, 6],
    [9, 1, 2, 5],
])

# Adding the raw arrays fails:
#   operands could not be broadcast together with shapes (2,4) (3,4)
# print(a + b)

# Move b's rows onto a leading axis of their own: (1, 3, 4) -> (3, 1, 4).
b_rows_first = b.reshape(1, 3, 4).transpose((1, 0, 2))
# Expected output: (3, 1, 4)
print(b_rows_first.shape)

# (1, 2, 4) + (3, 1, 4) broadcasts to (3, 2, 4): every row of b is added
# to every row of a.
# Expected output:
# [[[ 6  4  6  8]
#   [ 7  7 10 10]]
#
#  [[ 3  9 10 10]
#   [ 4 12 14 12]]
#
#  [[10  3  5  9]
#   [11  6  9 11]]]
a_single_group = a.reshape(1, 2, 4)
print(a_single_group + b_rows_first)
2._filter_boxes(boxes,min_size)
过滤尺寸小于min_size的proposal,并返回相应索引,被proposal_layer(...)函数调用
# 过滤尺寸小于min_size的proposal def _filter_boxes(boxes, min_size): """Remove all boxes with any side smaller than min_size.""" ws = boxes[:, 2] - boxes[:, 0] + 1 # proposal的宽 hs = boxes[:, 3] - boxes[:, 1] + 1 # proposal的高 # 将尺寸大于最低要求的proposal对应索引存入keep返回 keep = np.where((ws >= min_size) & (hs >= min_size))[0] return keep
3._filter_irregular_boxes(boxes, min_ratio = 0.2, max_ratio = 5)
过滤纵横比(宽/高)不在[min_ratio, max_ratio]区间内的proposal,并返回保留下来的proposal索引;proposal_layer(...)中对它的调用已被注释掉,当前并未启用
# 过滤纵横比<0.2或>0.5的proposal def _filter_irregular_boxes(boxes, min_ratio = 0.2, max_ratio = 5): """Remove all boxes with any side smaller than min_size.""" ws = boxes[:, 2] - boxes[:, 0] + 1 hs = boxes[:, 3] - boxes[:, 1] + 1 rs = ws / hs keep = np.where((rs <= max_ratio) & (rs >= min_ratio))[0] return keep