本文不阐述各个知识点的具体内容,只给出代码实现和个人理解。涉及的知识点如下:
- matplotlib画图
- 锚框的概念和实现
- 交并比
- 偏移量
- 非极大值抑制
目标检测和边界框
import d2lzh as d2l
from mxnet import image

d2l.set_figsize()
img = image.imread('img/catdog.jpg').asnumpy()

# Bounding boxes, each given as
# (upper-left x, upper-left y, lower-right x, lower-right y).
dog_bbox, cat_bbox = [60, 45, 378, 516], [400, 112, 655, 493]


def bbox_to_rect(bbox, color):
    """Convert a corner-format bounding box into a matplotlib rectangle.

    Args:
        bbox: Indexable box (upper-left x, upper-left y, lower-right x,
            lower-right y).
        color: Edge colour passed to matplotlib.

    Returns:
        An unfilled ``Rectangle`` with a 2-point edge, described the way
        matplotlib expects: ((upper-left x, upper-left y), width, height).
    """
    left, top = bbox[0], bbox[1]
    right, bottom = bbox[2], bbox[3]
    return d2l.plt.Rectangle(
        xy=(left, top),
        width=right - left,
        height=bottom - top,
        fill=False,       # outline only
        edgecolor=color,
        linewidth=2,
    )


fig = d2l.plt.imshow(img)
fig.axes.add_patch(bbox_to_rect(dog_bbox, 'blue'))
fig.axes.add_patch(bbox_to_rect(cat_bbox, 'red'))
锚框
以每个像素为中心,生成多个大小和宽高比不同的边界框,这些边界框称为锚框。
from mxnet import image, contrib, gluon, nd
import numpy as np
import d2lzh as d2l

np.set_printoptions(2)
img = image.imread('img/catdog.jpg').asnumpy()
h, w = img.shape[0:2]  # image height and width
print(h, w)

X = nd.random.uniform(shape=(1, 3, h, w))
# Five anchor boxes per pixel: each size paired with the first ratio, plus
# the first size paired with the remaining ratios (see the labels below).
Y = contrib.nd.MultiBoxPrior(X, sizes=[0.75, 0.5, 0.25], ratios=[1, 2, 0.5])
Y.shape  # (1, 2042040, 4)

# Re-index the anchors as (row, column, anchor index, 4 coordinates).
boxes = Y.reshape((h, w, 5, 4))
print(boxes[250, 250, :, :])


def show_bboxes(axes, bboxes, labels=None, colors=None):
    """Draw bounding boxes, optionally with a text label, on ``axes``.

    Args:
        axes: Target axes; must provide ``add_patch`` and ``text``.
        bboxes: Iterable of boxes; each box must provide ``asnumpy()`` and is
            handed to ``d2l.bbox_to_rect``.
        labels: ``None`` (no labels), a single label, or a list/tuple of
            labels. Boxes beyond the end of the list are drawn unlabelled.
        colors: ``None`` (use the default palette), a single colour, or a
            list/tuple of colours. Colours are cycled when there are more
            boxes than colours.
    """

    def _make_list(obj, default_values=None):
        # None -> defaults; a lone value -> one-element list; lists and
        # tuples pass through unchanged.
        if obj is None:
            obj = default_values
        elif not isinstance(obj, (list, tuple)):
            obj = [obj]
        return obj

    labels = _make_list(labels)
    colors = _make_list(colors, ['b', 'g', 'r', 'm', 'c'])
    for i, bbox in enumerate(bboxes):
        color = colors[i % len(colors)]
        rect = d2l.bbox_to_rect(bbox.asnumpy(), color)
        axes.add_patch(rect)
        if labels and len(labels) > i:  # only label boxes that have one
            # 'w' is white: use black text on a white box, white otherwise.
            text_color = 'k' if color == 'w' else 'w'
            # Put the label at the rectangle's anchor corner on a filled
            # background of the box colour.
            axes.text(rect.xy[0], rect.xy[1], labels[i],
                      va='center', ha='center', fontsize=9, color=text_color,
                      bbox=dict(facecolor=color, lw=0))


d2l.set_figsize()
bbox_scale = nd.array((w, h, w, h))
fig = d2l.plt.imshow(img)
# The anchor coordinates were divided by the image width and height, so
# scale them back to pixel coordinates before drawing.
show_bboxes(fig.axes, boxes[250, 250, :, :] * bbox_scale,
            ['s=0.75,r=1', 's=0.5,r=1', 's=0.25,r=1', 's=0.75,r=2',
             's=0.75,r=0.5'])
d2l.plt.show()
# Continuing from the code above, with the earlier drawing call commented out:
# show_bboxes(fig.axes, boxes[250, 250, :, :] * bbox_scale,
#             ['s=0.75,r=1', 's=0.5,r=1', 's=0.25,r=1', 's=0.75,r=2',
#              's=0.75,r=0.5'])

#In[6]
# Each ground-truth row starts with the class (0 = dog, 1 = cat); the
# remaining four values are the box coordinates.
ground_truth = nd.array([
    [0, 0.1, 0.08, 0.52, 0.92],
    [1, 0.55, 0.2, 0.9, 0.88],
])
# Five anchor boxes, coordinates only.
anchors = nd.array([
    [0, 0.1, 0.2, 0.3],
    [0.15, 0.2, 0.4, 0.4],
    [0.63, 0.05, 0.88, 0.98],
    [0.66, 0.45, 0.8, 0.8],
    [0.57, 0.3, 0.92, 0.9],
])

fig = d2l.plt.imshow(img)
# Ground truth is drawn in black ('k') to tell it apart from the five
# anchor-box colours.
show_bboxes(fig.axes, ground_truth[:, 1:] * bbox_scale, ['dog', 'cat'], 'k')
show_bboxes(fig.axes, anchors * bbox_scale, ['0', '1', '2', '3', '4'])
d2l.plt.show()
#In[1]
from mxnet import image, contrib, gluon, nd
import numpy as np
import d2lzh as d2l

np.set_printoptions(2)

#In[2]
img = image.imread('img/catdog.jpg').asnumpy()
h, w = img.shape[0:2]
print(h, w)
X = nd.random.uniform(shape=(1, 3, h, w))
Y = contrib.nd.MultiBoxPrior(X, sizes=[0.75, 0.5, 0.25], ratios=[1, 2, 0.5])
Y.shape

#In[3]
boxes = Y.reshape((h, w, 5, 4))


#In[4]
def show_bboxes(axes, bboxes, labels=None, colors=None):
    """Add one rectangle per bounding box to ``axes``, with optional labels.

    ``labels`` and ``colors`` may each be ``None``, a single value, or a
    list/tuple. Colours default to ['b', 'g', 'r', 'm', 'c'] and are cycled;
    a box is labelled only when a label exists at its index.
    """

    def _make_list(obj, default_values=None):
        # Normalise to a sequence: None falls back to the defaults, a lone
        # value is wrapped in a list, lists and tuples are returned as-is.
        if obj is None:
            return default_values
        if isinstance(obj, (list, tuple)):
            return obj
        return [obj]

    labels = _make_list(labels)
    colors = _make_list(colors, ['b', 'g', 'r', 'm', 'c'])
    for idx, box in enumerate(bboxes):
        edge_color = colors[idx % len(colors)]
        patch = d2l.bbox_to_rect(box.asnumpy(), edge_color)
        axes.add_patch(patch)
        if not labels or len(labels) <= idx:
            continue  # no label available for this box
        # 'w' is white, so a white box gets black text; every other colour
        # gets white text.
        font_color = 'k' if edge_color == 'w' else 'w'
        # Label sits at the rectangle's anchor corner on a filled background.
        axes.text(
            patch.xy[0], patch.xy[1], labels[idx],
            va='center', ha='center', fontsize=9, color=font_color,
            bbox=dict(facecolor=edge_color, lw=0),
        )


#In[5]
d2l.set_figsize()
bbox_scale = nd.array((w, h, w, h))

#In[6]
# Each ground-truth row starts with the class (0 = dog, 1 = cat); the
# remaining four values are the box coordinates.
ground_truth = nd.array([
    [0, 0.1, 0.08, 0.52, 0.92],
    [1, 0.55, 0.2, 0.9, 0.88],
])
# Five anchor boxes, coordinates only.
anchors = nd.array([
    [0, 0.1, 0.2, 0.3],
    [0.15, 0.2, 0.4, 0.4],
    [0.63, 0.05, 0.88, 0.98],
    [0.66, 0.45, 0.8, 0.8],
    [0.57, 0.3, 0.92, 0.9],
])

#In[7]
# MultiBoxTarget labels every anchor box with a class and offsets.
# expand_dims adds a leading axis: the 5x4 anchors become 1x5x4.
# nd.zeros((1, 3, 5)) builds a 1x3x5 array of zeros.
# Argument 1: input anchor boxes, shape (1, number of anchors, 4).
# Argument 2: ground-truth labels, shape (batch size, max ground-truth boxes
#             per image, 5): class label plus normalised coordinates.
# Argument 3: predicted class scores for the anchors, shape (batch size,
#             number of classes + 1, number of anchors).
labels = contrib.nd.MultiBoxTarget(
    anchors.expand_dims(axis=0),
    ground_truth.expand_dims(axis=0),
    nd.zeros((1, 3, 5)),
)

# Third returned item: the class assigned to each anchor. Background is 0,
# so real class indices are shifted up by one.
labels[2]
# [[0. 1. 2. 0. 2.]]
# Anchor 0 has an IoU below the threshold and is treated as background;
# the others follow the same rule.

# Second returned item: the mask, shape (batch size, 4 * number of anchors),
# one entry per offset of every anchor.
labels[1]
# [[0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1.]]

# First returned item: the four offsets labelled for each anchor; negative
# (background) anchors get zero offsets.
labels[0]
# [[ 0.00e+00  0.00e+00  0.00e+00  0.00e+00  1.40e+00  1.00e+01  2.59e+00
#    7.18e+00 -1.20e+00  2.69e-01  1.68e+00 -1.57e+00  0.00e+00  0.00e+00
#    0.00e+00  0.00e+00 -5.71e-01 -1.00e+00 -8.94e-07  6.26e-01]]

#In[8]
# Inputs for producing predicted bounding boxes with non-maximum suppression.
anchors = nd.array([
    [0.1, 0.08, 0.52, 0.92],
    [0.08, 0.2, 0.56, 0.95],
    [0.15, 0.3, 0.62, 0.91],
    [0.55, 0.2, 0.9, 0.88],
])
# anchors.size is 16; assume every predicted offset is 0.
offset_preds = nd.array([0] * anchors.size)
cls_probs = nd.array([
    [0] * 4,               # predicted probability of background
    [0.9, 0.8, 0.7, 0.1],  # predicted probability of dog
    [0.1, 0.2, 0.3, 0.9],  # predicted probability of cat
])