Tensorflow版Faster RCNN源码解析（TFFRCNN）（17） rpn_msr/proposal_layer_tf.py

本blog为github上CharlesShang/TFFRCNN版源码解析系列代码笔记

---------------个人学习笔记---------------

----------------本文作者吴疆--------------

"""
Outputs object detection proposals by applying estimated bounding-box
transformations to a set of regular boxes (called "anchors").
根据RPN目标回归值修正anchors并做后处理输出proposals和全0batch_ind组成的blob
"""

1.proposal_layer(rpn_cls_prob_reshape, rpn_bbox_pred, im_info, cfg_key, _feat_stride = [16,], anchor_scales = [8, 16, 32]) 算法逻辑

调用generate_anchors(...)(generate_anchors.py中)产生9个base anchors--->

im_info = im_info[0] # 取出第一张图像的信息更新im_info，其中依次存储该图像的高、宽和缩放因子--->

仅取出rpn_cls_prob_reshape层输出anchors属于fg的score--->

计算shifts偏移量，即conv5_3 feature map各个位置相对于(0,0)位置（在scaled图像上）的偏移，每行为[x偏移, y偏移, x偏移, y偏移]，如[0,16,0,16]；之所以用4列而不是2列表示，是因为anchor以(x1,y1,x2,y2)表示，同一偏移量需要同时加到左上角和右下角坐标上--->

在conv5_3 feature map各个位置利用shifts和9个base anchors产生所有anchors，计算anchors需对base anchors和shifts进行reshape，此处要用到Python的broadcast机制--->

调用bbox_transform_inv(...)（bbox_transform.py中）对所有anchors+预测得到的回归值得到proposals--->

调用clip_boxes(...)函数（bbox_transform.py中）将越界proposals限制在图像边界（原文说训练阶段，剔除越界的box；测试阶段，限制在图像边界，实际上代码表明均是限制在图像边界）--->

调用_filter_boxes(...)函数剔除尺寸小于min_size的proposals--->

按score从大到小对proposal进行排序，取前pre_nms_topN个proposals(训练12000，测试6000)--->

调用nms(...)（nms_wrapper.py中）进行nms处理，并取post_nms_topN个proposals（训练2000，测试300）--->

将proposals组成blob并返回，blob形状为(N, 5)（测试阶段N最多为300），5列 = 1列batch_ind（全0）+ 4列坐标（x1,y1,x2,y2）

def proposal_layer(rpn_cls_prob_reshape, rpn_bbox_pred, im_info, cfg_key, _feat_stride = [16,], anchor_scales = [8, 16, 32]):
    """Convert RPN outputs into region proposals (RoIs) for a single image.

    Anchors are generated at every feature-map cell, adjusted with the
    predicted box deltas, clipped to the image, filtered by minimum size,
    ranked by foreground score and finally de-duplicated with NMS.

    Parameters
    ----------
    rpn_cls_prob_reshape : array, shape (1, H, W, A*2)
        RPN class probabilities. The last axis is read as A (bg, fg) pairs.
        NOTICE: the old version was ordered as (1, H, W, 2, A).
    rpn_bbox_pred : array, shape (1, H, W, A*4)
        Box regression output of the RPN.
    im_info : indexable
        ``im_info[0]`` is read as [image_height, image_width, scale_ratio]
        of the (single) image in the batch.
    cfg_key : str
        'TRAIN' or 'TEST'; selects which section of ``cfg`` supplies the
        pre/post-NMS top-N, NMS threshold and minimum box size.
    _feat_stride : list
        Multiplier applied to feature-map cell indices to obtain pixel
        shifts (downsampling ratio of the feature map to the input image).
        The list default is never mutated here.
    anchor_scales : list
        Scales handed to ``generate_anchors`` for the base anchors.
        The list default is never mutated here.

    Returns
    -------
    blob : float32 array, shape (N, 5)
        One row ``[0, x1, y1, x2, y2]`` per proposal kept after NMS. The
        leading column is the batch index, always 0 because only
        single-image batches are supported. Scores are not returned.

    Algorithm
    ---------
    for each (H, W) location i
        generate A anchor boxes at cell i
        apply predicted bbox deltas at cell i to each of the A anchors
    clip predicted boxes to image
    remove predicted boxes with either height or width < threshold
    sort all (proposal, score) pairs by score from highest to lowest
    take top pre_nms_topN proposals before NMS
    apply NMS to remaining proposals
    take post_nms_topN proposals after NMS
    """
    # Base anchors for the first feature-map cell; A = number of anchors per cell.
    _anchors = generate_anchors(scales=np.array(anchor_scales))
    _num_anchors = _anchors.shape[0]

    # Only the first image's [height, width, scale] entry is used.
    im_info = im_info[0]
    assert rpn_cls_prob_reshape.shape[0] == 1, \
        'Only single item batches are supported'

    pre_nms_topN  = cfg[cfg_key].RPN_PRE_NMS_TOP_N   # e.g. 12000 (TRAIN) / 6000 (TEST)
    post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N  # e.g. 2000 (TRAIN) / 300 (TEST)
    nms_thresh    = cfg[cfg_key].RPN_NMS_THRESH      # e.g. 0.7
    min_size      = cfg[cfg_key].RPN_MIN_SIZE        # minimum side length in the original image, e.g. 16
    height, width = rpn_cls_prob_reshape.shape[1:3]  # feature-map H and W

    # (1, H, W, A*2) -> (1, H, W, A, 2) -> take index 1 (fg) -> (1, H, W, A)
    scores = np.reshape(rpn_cls_prob_reshape,
                        [1, height, width, _num_anchors, 2])[:, :, :, :, 1]

    # TODO: NOTICE: the old version is ordered by (1, H, W, 2, A) !!!!
    # TODO: if you use the old trained model, VGGnet_fast_rcnn_iter_70000.ckpt, uncomment this line
    # scores = rpn_cls_prob_reshape[:,:,:,_num_anchors:]

    if DEBUG:
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print('im_size: ({}, {})'.format(im_info[0], im_info[1]))
        print('scale: {}'.format(im_info[2]))
        print('score map size: {}'.format(scores.shape))

    # 1. Generate proposals from bbox deltas and shifted anchors.
    # Pixel offset of every feature-map cell relative to cell (0, 0).
    shift_x = np.arange(0, width) * _feat_stride
    shift_y = np.arange(0, height) * _feat_stride
    shift_x, shift_y = np.meshgrid(shift_x, shift_y)
    # Four columns (x, y, x, y) so one shift row can be added directly to an
    # anchor's (x1, y1, x2, y2) coordinates. shifts.shape == (H * W, 4).
    shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                        shift_x.ravel(), shift_y.ravel())).transpose()

    # Enumerate all shifted anchors via broadcasting:
    #   anchors (1, A, 4) + shifts (K, 1, 4) -> (K, A, 4) -> (K * A, 4)
    A = _num_anchors
    K = shifts.shape[0]
    anchors = _anchors.reshape((1, A, 4)) + \
              shifts.reshape((1, K, 4)).transpose((1, 0, 2))
    anchors = anchors.reshape((K * A, 4))

    # Flatten deltas and scores so that rows are ordered by (h, w, a), slowest
    # to fastest, which is the same order as the anchors above.
    bbox_deltas = rpn_bbox_pred.reshape((-1, 4))
    scores = scores.reshape((-1, 1))

    # Convert anchors into proposals via the predicted bbox transformations.
    proposals = bbox_transform_inv(anchors, bbox_deltas)

    # 2. Clip predicted boxes to the image (both TRAIN and TEST clip here).
    proposals = clip_boxes(proposals, im_info[:2])

    # 3. Remove predicted boxes with either height or width < threshold.
    # (NOTE: convert min_size to input image scale stored in im_info[2])
    keep = _filter_boxes(proposals, min_size * im_info[2])
    proposals = proposals[keep, :]
    scores = scores[keep]

    # remove irregular boxes, too fat too tall
    # keep = _filter_irregular_boxes(proposals)
    # proposals = proposals[keep, :]
    # scores = scores[keep]

    # 4. Sort all (proposal, score) pairs by score from highest to lowest.
    # 5. Take top pre_nms_topN.
    # argsort() is ascending, so reverse it for a descending order.
    order = scores.ravel().argsort()[::-1]
    if pre_nms_topN > 0:
        order = order[:pre_nms_topN]
    proposals = proposals[order, :]
    scores = scores[order]

    # 6. Apply NMS on [x1, y1, x2, y2, score] rows.
    # 7. Take post_nms_topN of the surviving indices.
    keep = nms(np.hstack((proposals, scores)), nms_thresh)
    if post_nms_topN > 0:
        keep = keep[:post_nms_topN]
    proposals = proposals[keep, :]
    scores = scores[keep]

    # 8. Output rois blob. Only a single input image is supported, so all
    # batch indices are 0.
    batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32)
    blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False)))
    return blob

# -*- coding:utf-8 -*-
# Author: WUJiang
# 测试功能
# np.meshgrid()函数和np.ravel()

import numpy as np

# Demo of np.meshgrid() and ndarray.ravel().
xs = np.arange(4)       # 0..3, becomes every row of the first grid
ys = np.arange(1, 5)    # 1..4, becomes every column of the second grid

# np.meshgrid(first, second):
#   first result  - `first` repeated as a row, len(second) times
#   second result - `second` repeated as a column, len(first) times
shift_x, shift_y = np.meshgrid(xs, ys)

# Expected output, in order:
#   shift_x          -> [[0 1 2 3]
#                        [0 1 2 3]
#                        [0 1 2 3]
#                        [0 1 2 3]]
#   shift_x.ravel()  -> [0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3]
#   shift_y          -> [[1 1 1 1]
#                        [2 2 2 2]
#                        [3 3 3 3]
#                        [4 4 4 4]]
#   shift_y.ravel()  -> [1 1 1 1 2 2 2 2 3 3 3 3 4 4 4 4]
for grid in (shift_x, shift_y):
    print(grid)
    # ravel() flattens the 2-D grid into a 1-D array
    print(grid.ravel())

# -*- coding:utf-8 -*-
# Author: WUJiang
# 测试功能
# python中broadcast机制

import numpy as np

# Demo of NumPy broadcasting, as used to combine base anchors with shifts.
a = np.array([[1, 2, 3, 4], [2, 5, 7, 6]])                 # shape (2, 4)
b = np.array([[5, 2, 3, 4], [2, 7, 7, 6], [9, 1, 2, 5]])   # shape (3, 4)

# a + b would fail:
# "operands could not be broadcast together with shapes (2,4) (3,4)"

# Insert a length-1 middle axis so b becomes (3, 1, 4).
b_col = b[:, np.newaxis, :]
print(b_col.shape)

# (1, 2, 4) + (3, 1, 4) broadcasts to (3, 2, 4): every row of b is added to
# every row of a. Expected output:
# [[[ 6  4  6  8]
#   [ 7  7 10 10]]
#
#  [[ 3  9 10 10]
#   [ 4 12 14 12]]
#
#  [[10  3  5  9]
#   [11  6  9 11]]]
print(a[np.newaxis, :, :] + b_col)

2._filter_boxes(boxes,min_size)

过滤尺寸小于min_size的proposal，并返回相应索引，被proposal_layer(...)函数调用

# Keep only proposals whose width and height both reach min_size.
def _filter_boxes(boxes, min_size):
    """Return the row indices of boxes at least min_size wide and tall.

    boxes is indexed as an (N, 4+) array of [x1, y1, x2, y2, ...] rows;
    side lengths are computed inclusively (x2 - x1 + 1).
    """
    widths = boxes[:, 2] - boxes[:, 0] + 1
    heights = boxes[:, 3] - boxes[:, 1] + 1
    # A box survives only if neither side is below the minimum.
    large_enough = np.logical_and(widths >= min_size, heights >= min_size)
    return np.flatnonzero(large_enough)

3._filter_irregular_boxes(boxes, min_ratio = 0.2, max_ratio = 5)

过滤宽高比不在规定区间[min_ratio, max_ratio]内的proposal，并返回保留下来的proposal对应索引；该函数在proposal_layer(...)函数中的调用已被注释掉（默认未启用）

# Filter out proposals whose width/height ratio is < 0.2 or > 5 (defaults).
def _filter_irregular_boxes(boxes, min_ratio = 0.2, max_ratio = 5):
    """Return the row indices of boxes whose width/height ratio is in range.

    boxes is indexed as an (N, 4+) array of [x1, y1, x2, y2, ...] rows;
    side lengths are computed inclusively (x2 - x1 + 1). A box is kept when
    min_ratio <= width / height <= max_ratio.
    """
    ws = boxes[:, 2] - boxes[:, 0] + 1
    hs = boxes[:, 3] - boxes[:, 1] + 1
    # true_divide keeps the ratio fractional even for integer boxes under
    # Python 2, where a plain `/` would floor (e.g. 10 / 40 -> 0).
    rs = np.true_divide(ws, hs)
    keep = np.where((rs <= max_ratio) & (rs >= min_ratio))[0]
    return keep

来源：https://www.cnblogs.com/deeplearning1314/p/11341824.html

标签

rcnn

cls

info

bbox

tensorflow

Tensorflow版Faster RCNN源码解析（TFFRCNN） （17） rpn_msr/proposal_layer_tf.py

Tensorflow版Faster RCNN源码解析（TFFRCNN）（17） rpn_msr/proposal_layer_tf.py