Tensorflow版Faster RCNN源码解析（TFFRCNN）（10） roi_data_layer/roidb.py

本blog为github上CharlesShang/TFFRCNN版源码解析系列代码笔记

---------------个人学习笔记---------------

----------------本文作者吴疆--------------

"""Transform a roidb into a trainable roidb by adding a bunch of metadata 元数据."""

1.prepare_roidb(imdb)

roidb = imdb.roidb；为roidb[i]增加并赋值以下字段：'image'（图像路径）、'width'（图像宽）、'height'（图像高）、'max_classes'（图像中各roi与哪一类gt的IOU最大）、'max_overlaps'（各roi与gt的最大IOU值），从而得到imdb数据集所有图像的roidb。未见该函数被调用。

# 增加roidb[i]中'image'(记录图像路径)、'width'(记录图像宽)、'height'（记录图像高）、
# 'max_classes'(记录该图像中各roi与哪类gt IOU最大)、'max_overlaps'（记录各roi与gt最大的IOU值）字段赋值得到数据集所有图像的roidb
def prepare_roidb(imdb):
    """Enrich ``imdb.roidb`` in place with derived fields used for training.

    For every image ``i`` the following keys are written to
    ``imdb.roidb[i]``:

    * ``'image'``        -- the path returned by ``imdb.image_path_at(i)``.
    * ``'width'``        -- first element of the PIL image ``size`` tuple.
    * ``'height'``       -- second element of the PIL image ``size`` tuple.
    * ``'max_overlaps'`` -- row-wise maximum of the dense ``'gt_overlaps'``
      matrix (largest overlap of each ROI over the class columns).
    * ``'max_classes'``  -- row-wise argmax of ``'gt_overlaps'`` (the class
      column holding that maximum).

    Args:
        imdb: dataset object exposing ``roidb``, ``num_images``,
            ``image_index`` and ``image_path_at(i)``. Each ``roidb[i]`` must
            already hold a ``'gt_overlaps'`` entry supporting ``.toarray()``.

    Returns:
        None. ``imdb.roidb`` entries are modified in place.

    Raises:
        AssertionError: if an ROI with zero max overlap is not assigned
            class 0 (background), or an ROI with positive max overlap is
            assigned class 0.
    """
    # Read every image size up front. The context manager closes each file
    # as soon as its size is known, so handles do not pile up on big datasets.
    sizes = []
    for i in range(imdb.num_images):
        with PIL.Image.open(imdb.image_path_at(i)) as im:
            sizes.append(im.size)

    # roidb is an attribute of the imdb dataset object.
    roidb = imdb.roidb
    for i in range(len(imdb.image_index)):
        entry = roidb[i]
        entry['image'] = imdb.image_path_at(i)
        entry['width'] = sizes[i][0]
        entry['height'] = sizes[i][1]

        # gt_overlaps is needed as a dense array for max/argmax.
        gt_overlaps = entry['gt_overlaps'].toarray()
        # Max overlap with gt over classes (columns).
        max_overlaps = gt_overlaps.max(axis=1)
        # Class column that had the max overlap.
        max_classes = gt_overlaps.argmax(axis=1)
        entry['max_classes'] = max_classes
        entry['max_overlaps'] = max_overlaps

        # Sanity checks.
        # Max overlap of 0 => class should be zero (background).
        assert np.all(max_classes[max_overlaps == 0] == 0)
        # Max overlap > 0 => class should not be zero (must be a fg class).
        assert np.all(max_classes[max_overlaps > 0] != 0)

# -*- coding:utf-8 -*-
# Author: WUJiang
# 测试功能

from PIL import Image

# Open a sample JPEG (the path is specific to the author's machine).
a = Image.open(r'C:\Users\Administrator\Desktop\292.jpg')
# Author's observed output: <class 'PIL.JpegImagePlugin.JpegImageFile'>
print(type(a))
# ``size`` is a 2-tuple of dimensions, not a single pixel count.
b = a.size
# Author's observed output: (352, 240), rather than 84480 (352*240)
print(b)

numpy中对axis=0和axis=1的理解示意图

2.add_bbox_regression_targets(roidb)

调用_compute_targets(...)函数计算roi的回归目标值，并对其规范化，返回回归目标值的均值和标准差以便测试阶段使用

roidb[i]本身还包括'boxes'(源于prepare_roidb(...)函数中imdb.roidb)，增加roidb[i]中的'bbox_targets'字段

TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED = True

# roidb[i]本身还包括'boxes'(源于prepare_roidb(...)函数中imdb.roidb)
# 增加roidb[i]中的'bbox_targets'(调用_compute_targets(...)函数得到，对bbox_targets进行规范化)
def add_bbox_regression_targets(roidb):
    """Add the information needed to train bounding-box regressors.

    For each image, ``_compute_targets`` is called with the entry's
    ``'boxes'``, ``'max_overlaps'`` and ``'max_classes'``; the result is
    stored under ``'bbox_targets'`` (column 0 is the class label, columns
    1: are the regression targets). Per-class means and standard deviations
    are then either taken from the config or estimated from the data, and,
    if ``cfg.TRAIN.BBOX_NORMALIZE_TARGETS`` is set, the targets of every
    foreground class are normalized in place as ``(target - mean) / std``.

    Args:
        roidb: non-empty sequence of per-image dicts that already went
            through ``prepare_roidb`` (must contain ``'max_classes'``,
            ``'max_overlaps'``, ``'boxes'`` and ``'gt_overlaps'``).

    Returns:
        Tuple ``(means.ravel(), stds.ravel())``: the flattened per-class
        statistics (one row per class before flattening). They are needed
        at prediction time to un-normalize and un-center the outputs.

    Raises:
        AssertionError: if ``roidb`` is empty, if ``prepare_roidb`` was not
            run first, or if an empirically estimated standard deviation of
            a class that has samples is below 0.01 (or is NaN).
    """
    assert len(roidb) > 0
    assert 'max_classes' in roidb[0], 'Did you call prepare_roidb first?'

    # Infer number of classes from the number of columns in gt_overlaps.
    num_classes = roidb[0]['gt_overlaps'].shape[1]

    # Compute the raw (un-normalized) targets for every image.
    for entry in roidb:
        entry['bbox_targets'] = _compute_targets(
            entry['boxes'], entry['max_overlaps'], entry['max_classes'])

    # Per the original notes this flag defaults to True.
    if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
        # Use fixed / precomputed "means" and "stds" instead of empirical
        # values; the same row is repeated for every class.
        means = np.tile(
            np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS), (num_classes, 1))
        stds = np.tile(
            np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS), (num_classes, 1))
    else:
        # Accumulate what is needed for var(x) = E(x^2) - E(x)^2.
        # cfg.EPS (1e-14 by default per the original notes) keeps the
        # division below finite for classes without any sample.
        class_counts = np.zeros((num_classes, 1)) + cfg.EPS
        sums = np.zeros((num_classes, 4))
        squared_sums = np.zeros((num_classes, 4))
        # Tracks which classes contributed at least one target.
        has_samples = np.zeros(num_classes, dtype=bool)
        for entry in roidb:
            targets = entry['bbox_targets']
            # Class 0 (background) is skipped.
            for cls in range(1, num_classes):
                cls_inds = np.where(targets[:, 0] == cls)[0]
                if cls_inds.size > 0:
                    has_samples[cls] = True
                    class_counts[cls] += cls_inds.size
                    sums[cls, :] += targets[cls_inds, 1:].sum(axis=0)
                    squared_sums[cls, :] += \
                        (targets[cls_inds, 1:] ** 2).sum(axis=0)
        means = sums / class_counts
        stds = np.sqrt(squared_sums / class_counts - means ** 2)
        # A tiny (or NaN) std makes the normalized targets blow up / go NaN.
        # Only classes with samples are checked: the background row and
        # unseen classes legitimately have std 0 and are never divided by.
        # The original check (`np.min(stds) < 0.01`) was inverted and always
        # passed because of the all-zero background row.
        assert np.all(stds[has_samples] >= 0.01), \
            'Boxes std is too small, std:{}'.format(stds)

    # Report the mean and standard deviation of the regression targets.
    print('bbox target means:')
    print(means)
    print(means[1:, :].mean(axis=0))  # ignore bg class
    print('bbox target stdevs:')
    print(stds)
    print(stds[1:, :].mean(axis=0))  # ignore bg class

    # Normalize targets (subtract mean, divide by stddev).
    # Per the original notes this flag defaults to True.
    if cfg.TRAIN.BBOX_NORMALIZE_TARGETS:
        print("Normalizing targets")
        for entry in roidb:
            targets = entry['bbox_targets']
            for cls in range(1, num_classes):
                cls_inds = np.where(targets[:, 0] == cls)[0]
                # In-place updates on the array stored in the roidb entry.
                targets[cls_inds, 1:] -= means[cls, :]
                targets[cls_inds, 1:] /= stds[cls, :]
    else:
        print("NOT normalizing targets")

    # These values will be needed for making predictions
    # (the predictions will need to be unnormalized and uncentered).
    return means.ravel(), stds.ravel()

3._compute_targets(rois, overlaps, labels)

计算ex_rois（max_overlap ≥ TRAIN.BBOX_THRESH，默认0.5）的回归目标值targets（N×5，每行为1个类别标签+4个回归目标值）并返回。代码中以max_overlap等于1的roi作为gt_roi，以max_overlap不小于0.5的roi作为ex_roi；非ex_roi的rois对应的回归目标值全为0。调用的bbox_overlaps(...)由C编译而来。本函数被add_bbox_regression_targets(...)函数调用。（遗留疑问：roidb到底来源于RPN还是gt？为什么其中既有roi又有gt？）

# 形参overlaps、labels分别为roidb中max_overlaps、max_classes
def _compute_targets(rois, overlaps, labels):
    """
    Compute bounding-box regression targets for an image.
    for each roi find the corresponding gt_box, then compute the distance.
    """
    # Indices of ground-truth ROIs  overlaps == 1表明对应roi为gt
    # roidb到底来源于RPN还是gt？？？？为什么既有roi又有gt？？？
    # ex_roi(max_overlap需大于0.5)和gt_roi(max_overlap等于1)
    gt_inds = np.where(overlaps == 1)[0]
    if len(gt_inds) == 0:
        # Bail if the image has no ground-truth ROIs
        return np.zeros((rois.shape[0], 5), dtype=np.float32)
    # Indices of examples for which we try to make predictions
    # 默认TRAIN.BBOX_THRESH = 0.5
    # Overlap required between a ROI and ground-truth box in order for that ROI to
    # be used as a bounding-box regression training example
    ex_inds = np.where(overlaps >= cfg.TRAIN.BBOX_THRESH)[0]

    # Get IoU overlap between each ex ROI and gt ROI
    # bbox_overlaps由C编译？？？计算IOU？？？
    # np.ascontiguousarray() 返回一个地址连续的数组（C order）
    ex_gt_overlaps = bbox_overlaps(
        np.ascontiguousarray(rois[ex_inds, :], dtype=np.float),
        np.ascontiguousarray(rois[gt_inds, :], dtype=np.float))
    # Find which gt ROI each ex ROI has max overlap with:
    # this will be the ex ROI's gt target
    gt_assignment = ex_gt_overlaps.argmax(axis=1)
    gt_rois = rois[gt_inds[gt_assignment], :]
    ex_rois = rois[ex_inds, :]
    # 非ex_roi的rois对应回归目标值全0
    targets = np.zeros((rois.shape[0], 5), dtype=np.float32)
    targets[ex_inds, 0] = labels[ex_inds]
    # ex_rois的回归目标值，调用bbox_transform(...)得到
    targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois)
    return targets

来源：https://www.cnblogs.com/deeplearning1314/p/11314523.html

标签

rcnn