Tensorflow版Faster RCNN源码解析（TFFRCNN）（11） gt_data_layer/minibatch.py

本blog为github上CharlesShang/TFFRCNN版源码解析系列代码笔记

---------------个人学习笔记---------------

----------------本文作者吴疆--------------

与roi_data_layer/minibatch.py类似，该函数可能并未执行

"""Compute minibatch blobs for training a Fast R-CNN network."""

1.get_minibatch(roidb, num_classes)

读取roidb[i]['info_boxes']，将其第3、8列（下标2、7）加上 i * num_scale 的批索引偏移后，拼接成blobs中的'info_boxes'字段（共18列，18的具体含义未知）；同时增加'data'（图像数据blob）和'parameters'字段（相关参数，依次为num_scale 图像缩放尺度数量、num_aspect 使用纵横比数量、cfg.TRAIN.SCALES、cfg.TRAIN.SCALE_MAPPING、cfg.TRAIN.ASPECT_HEIGHTS、cfg.TRAIN.ASPECT_WIDTHS）。其中后3个配置项在默认配置中均未找到，若执行到此处按理会报错，因此该函数很可能并未被执行；未见调用

# Build the 'data', 'info_boxes' and 'parameters' blobs for one minibatch.
def get_minibatch(roidb, num_classes):
    """Given a roidb, construct a minibatch sampled from it.

    Args:
        roidb: sequence of per-image dicts. Each entry must provide
            'info_boxes' (a 2-D array with 18 columns) in addition to the
            keys read by ``_get_image_blob``.
        num_classes: not used by this function; kept so the signature
            stays compatible with existing callers.

    Returns:
        dict with three entries:
            'data': image blob returned by ``_get_image_blob(roidb)``.
            'info_boxes': all per-image 'info_boxes' rows stacked
                vertically, with columns 2 and 7 offset by
                ``i * num_scale`` for the i-th image.
            'parameters': 1-D float32 array laid out as
                [num_scale, num_aspect, SCALES..., SCALE_MAPPING...,
                 ASPECT_HEIGHTS..., ASPECT_WIDTHS...].

    Raises:
        AssertionError: if cfg.TRAIN.BATCH_SIZE is not a multiple of
            len(roidb).
    """
    num_images = len(roidb)
    assert(cfg.TRAIN.BATCH_SIZE % num_images == 0), \
        'num_images ({}) must divide BATCH_SIZE ({})'. \
        format(num_images, cfg.TRAIN.BATCH_SIZE)
    # Get the input image blob, formatted for caffe
    im_blob = _get_image_blob(roidb)

    num_scale = len(cfg.TRAIN.SCALES)

    # Build the box information blob. The empty 18-column float32 seed keeps
    # the original column-count check and dtype promotion of the stack.
    # NOTE(review): the meaning of the 18 columns is not visible here.
    stacked = [np.zeros((0, 18), dtype=np.float32)]
    for i, entry in enumerate(roidb):
        # Work on a copy: shifting the indices in place would modify the
        # caller's roidb, so the offsets would accumulate every time the
        # same entry is sampled again.
        info_boxes = np.array(entry['info_boxes'])
        # Change the batch index: each image contributes num_scale slots.
        info_boxes[:, 2] += i * num_scale
        info_boxes[:, 7] += i * num_scale
        stacked.append(info_boxes)
    # Stack once instead of re-copying the growing blob on every iteration.
    info_boxes_blob = np.vstack(stacked)

    # Build the parameter blob.
    # NOTE(review): cfg.TRAIN.SCALE_MAPPING, ASPECT_HEIGHTS and ASPECT_WIDTHS
    # must exist in the active config and match the slice lengths below --
    # confirm against the config module.
    num_aspect = len(cfg.TRAIN.ASPECTS)
    num = 2 + 2 * num_scale + 2 * num_aspect
    scale_end = 2 + num_scale
    mapping_end = scale_end + num_scale
    heights_end = mapping_end + num_aspect
    widths_end = heights_end + num_aspect
    parameters_blob = np.zeros((num), dtype=np.float32)
    parameters_blob[0] = num_scale
    parameters_blob[1] = num_aspect
    parameters_blob[2:scale_end] = cfg.TRAIN.SCALES
    parameters_blob[scale_end:mapping_end] = cfg.TRAIN.SCALE_MAPPING
    parameters_blob[mapping_end:heights_end] = cfg.TRAIN.ASPECT_HEIGHTS
    parameters_blob[heights_end:widths_end] = cfg.TRAIN.ASPECT_WIDTHS

    blobs = {'data': im_blob,
             'info_boxes': info_boxes_blob,
             'parameters': parameters_blob}
    return blobs

2._get_image_blob(roidb)

对传入的roidb中图像减均值、缩放处理，得到处理后的图像存储到processed_ims列表中，将其作为参数传入im_list_to_blob(...)函数中返回图像数据blob，被get_minibatch(...)函数调用，构成blobs中的'data'字段

与roi_data_layer/minibatch.py中的同名函数（仅使用target_size单一尺度进行缩放）相比，此函数的区别在于缩放时使用了多尺度TRAIN.SCALES_BASE = (0.25, 0.5, 1.0, 2.0, 3.0)。为何要使用多尺度？除本文件的get_minibatch(...)外未见其他调用

def _get_image_blob(roidb):
    """Build an input blob from the roidb images at every base scale.

    Each image is loaded with cv2, mirrored left-right when the entry's
    'flipped' flag is set, converted to float32 and has cfg.PIXEL_MEANS
    subtracted. It is then resized once per factor in
    cfg.TRAIN.SCALES_BASE, so the blob holds
    ``len(roidb) * len(cfg.TRAIN.SCALES_BASE)`` images, grouped by source
    image.

    Args:
        roidb: sequence of dicts providing 'image' (file path) and
            'flipped' (truthy when the image must be mirrored).

    Returns:
        The result of ``im_list_to_blob`` applied to the processed images.

    Raises:
        IOError: if an image file cannot be read.
    """
    # Rescaled images, later packed into a single blob.
    processed_ims = []
    for entry in roidb:
        im = cv2.imread(entry['image'])
        if im is None:
            # cv2.imread reports failure by returning None rather than
            # raising; fail here with the offending path instead of an
            # opaque error further down.
            raise IOError(
                'Failed to read image: {}'.format(entry['image']))
        if entry['flipped']:
            im = im[:, ::-1, :]
        im_orig = im.astype(np.float32, copy=True)
        im_orig -= cfg.PIXEL_MEANS
        # Build the image pyramid: one resized copy per base scale.
        for im_scale in cfg.TRAIN.SCALES_BASE:
            im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale,
                            interpolation=cv2.INTER_LINEAR)
            processed_ims.append(im)
    # Create a blob to hold the input images.
    return im_list_to_blob(processed_ims)

3._project_im_rois(im_rois, im_scale_factor)

对rois进行缩放，未见调用

def _project_im_rois(im_rois, im_scale_factor):
    """Scale RoIs from original-image coordinates to the rescaled image.

    Returns ``im_rois`` multiplied by ``im_scale_factor``.
    """
    return im_rois * im_scale_factor

4._get_bbox_regression_labels(bbox_target_data, num_classes)

将N*5的紧凑形式bbox_target_data扩充为N*(4*num_classes)的bbox_targets（网络接受的shape，每行仅所属类别对应的4列有非0的回归目标值），并构造同样为N*(4*num_classes)的bbox_loss_weights，返回bbox_targets和bbox_loss_weights，未见调用

# Expand the compact N x 5 targets to N x (4 * num_classes) and build the
# matching N x (4 * num_classes) loss weights.
def _get_bbox_regression_labels(bbox_target_data, num_classes):
    """Expand compact bounding-box regression targets to the network layout.

    Targets are stored in a compact form in the roidb: one row per RoI
    holding the class label followed by four regression values. This
    function expands them into the 4-of-4*K representation used by the
    network (i.e. only the columns of the RoI's own class are non-zero).
    The loss weights are expanded the same way.

    Args:
        bbox_target_data (ndarray): N x 5 array; column 0 is the class
            label, columns 1-4 are the regression targets.
        num_classes (int): K, the number of classes.

    Returns:
        bbox_targets (ndarray): N x 4K float32 blob of regression targets
        bbox_loss_weights (ndarray): N x 4K float32 blob of loss weights
    """
    clss = bbox_target_data[:, 0]
    bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32)
    bbox_loss_weights = np.zeros(bbox_targets.shape, dtype=np.float32)
    # Rows with label 0 (background) keep all-zero targets and weights.
    for ind in np.where(clss > 0)[0]:
        # The label is read from a (typically float) array; slice bounds
        # must be real integers or NumPy raises TypeError.
        start = 4 * int(clss[ind])
        end = start + 4
        bbox_targets[ind, start:end] = bbox_target_data[ind, 1:]
        # Weight 1 on the four columns of the RoI's class, 0 elsewhere.
        bbox_loss_weights[ind, start:end] = [1., 1., 1., 1.]
    return bbox_targets, bbox_loss_weights

5._vis_minibatch(im_blob, rois_blob, labels_blob, sublabels_blob)

绘制roi矩形框，打印相关信息，未见调用

# Draw each RoI rectangle on its source image and print its labels.
def _vis_minibatch(im_blob, rois_blob, labels_blob, sublabels_blob):
    """Visualize a mini-batch for debugging.

    For every row of ``rois_blob`` the source image is taken from
    ``im_blob``, cfg.PIXEL_MEANS is added back, the channel order is
    reversed, and the image is shown with the RoI drawn as a red
    rectangle. The class and subclass labels are printed. Each figure
    blocks on ``plt.show()``.

    Args:
        im_blob: 4-D image array indexed as [image, a, b, c]; each image
            is transposed with (1, 2, 0) before display.
            NOTE(review): presumably channel-first storage -- confirm.
        rois_blob: 2-D array, one RoI per row. Column 0 is the index of
            the source image in ``im_blob``; columns 2-5 are used as
            (x1, y1, x2, y2).
            NOTE(review): column 1 is skipped and its meaning is not
            visible here -- confirm against the producer of this blob.
        labels_blob: per-RoI class labels, indexed by row.
        sublabels_blob: per-RoI subclass labels, indexed by row.
    """
    import matplotlib.pyplot as plt
    for i in range(rois_blob.shape[0]):
        rois = rois_blob[i, :]
        # The index is read from a (typically float) blob; NumPy requires
        # an integer to index an array.
        im_ind = int(rois[0])
        roi = rois[2:]
        im = im_blob[im_ind, :, :, :].transpose((1, 2, 0)).copy()
        im += cfg.PIXEL_MEANS
        # Reverse the channel order (presumably BGR -> RGB for matplotlib).
        im = im[:, :, (2, 1, 0)]
        im = im.astype(np.uint8)
        cls = labels_blob[i]
        subcls = sublabels_blob[i]
        plt.imshow(im)
        # One pre-formatted string prints the same text on Python 2 and 3.
        print('class:  {}  subclass:  {}'.format(cls, subcls))
        plt.gca().add_patch(
            plt.Rectangle((roi[0], roi[1]), roi[2] - roi[0],
                          roi[3] - roi[1], fill=False,
                          edgecolor='r', linewidth=3)
            )
        plt.show()

来源：https://www.cnblogs.com/deeplearning1314/p/11325018.html

标签

rcnn