深度学习笔记-计算机视觉

本文不阐述各个知识点具体内容，只给出代码实现和理解，其中涉及到的知识点如下

matplotlib画图
锚框的概念和实现
交并比
偏移量
非极大值抑制

目标检测和边框值

import d2lzh as d2l
from mxnet import image

d2l.set_figsize()
img = image.imread('img/catdog.jpg').asnumpy()

# Bounding boxes in corner format: (top-left x, top-left y,
# bottom-right x, bottom-right y).
dog_bbox, cat_bbox = [60, 45, 378, 516], [400, 112, 655, 493]


def bbox_to_rect(bbox, color):
    """Convert a corner-format bounding box into a matplotlib rectangle.

    Args:
        bbox: Indexable of four numbers
            (top-left x, top-left y, bottom-right x, bottom-right y).
        color: Edge colour passed to matplotlib.

    Returns:
        An unfilled ``Rectangle`` patch with a line width of 2, described
        as ((top-left x, top-left y), width, height).
    """
    left, top, right, bottom = bbox[0], bbox[1], bbox[2], bbox[3]
    return d2l.plt.Rectangle(
        xy=(left, top),
        width=right - left,
        height=bottom - top,
        fill=False,        # outline only
        edgecolor=color,
        linewidth=2,
    )


fig = d2l.plt.imshow(img)
fig.axes.add_patch(bbox_to_rect(dog_bbox, 'blue'))
fig.axes.add_patch(bbox_to_rect(cat_bbox, 'red'))

锚框

以每个像素为中心，生成多个大小和宽高比不同的边界框，这些边界框称为锚框。

 # Generate anchor boxes for every pixel of the image and draw the ones
 # centred on pixel (250, 250).
from mxnet import image, contrib, gluon, nd
import numpy as np
import d2lzh as d2l

np.set_printoptions(2)

img = image.imread('img/catdog.jpg').asnumpy()
h, w = img.shape[0:2]  # height and width
print(h, w)
X = nd.random.uniform(shape=(1, 3, h, w))
# 5 anchor boxes per pixel (see the five labels passed to show_bboxes below).
Y = contrib.nd.MultiBoxPrior(X, sizes=[0.75, 0.5, 0.25], ratios=[1, 2, 0.5])
Y.shape  # (1, 2042040, 4) for the author's image
boxes = Y.reshape((h, w, 5, 4))
print(boxes[250, 250, :, :])


def show_bboxes(axes, bboxes, labels=None, colors=None):
    """Draw bounding boxes, with optional text labels, on matplotlib axes.

    Args:
        axes: Matplotlib axes to draw on.
        bboxes: Iterable of boxes; each element must provide ``asnumpy()``
            and is handed to ``d2l.bbox_to_rect``.
        labels: ``None``, a single label, or a list/tuple of labels. Box
            ``i`` is labelled only if ``labels`` has more than ``i`` items.
        colors: ``None``, a single colour, or a list/tuple of colours.
            Defaults to ``['b', 'g', 'r', 'm', 'c']``; colours are reused
            cyclically when there are more boxes than colours.
    """
    def _make_list(obj, default_values=None):
        # Normalise None / scalar / sequence into a list-like value.
        if obj is None:
            obj = default_values
        elif not isinstance(obj, (list, tuple)):
            obj = [obj]
        return obj

    labels = _make_list(labels)
    colors = _make_list(colors, ['b', 'g', 'r', 'm', 'c'])
    for i, bbox in enumerate(bboxes):
        color = colors[i % len(colors)]
        rect = d2l.bbox_to_rect(bbox.asnumpy(), color)
        axes.add_patch(rect)
        if labels and len(labels) > i:  # only label boxes that have a label
            # 'w' is white: use black text on a white box, white otherwise.
            text_color = 'k' if color == 'w' else 'w'
            # Text on a filled background box, anchored at the rectangle's xy.
            axes.text(rect.xy[0], rect.xy[1], labels[i],
                      va='center', ha='center', fontsize=9, color=text_color,
                      bbox=dict(facecolor=color, lw=0))


d2l.set_figsize()
bbox_scale = nd.array((w, h, w, h))
fig = d2l.plt.imshow(img)
# The anchor coordinates were divided by the image width and height, so
# multiply by bbox_scale to get back to pixel coordinates.
show_bboxes(fig.axes, boxes[250, 250, :, :] * bbox_scale,
            ['s=0.75,r=1', 's=0.5,r=1', 's=0.25,r=1', 's=0.75,r=2',
             's=0.75,r=0.5'])

d2l.plt.show()

# Building on the script above, comment out the call
#   show_bboxes(fig.axes, boxes[250,250,:,:]*bbox_scale,
#               ['s=0.75,r=1','s=0.5,r=1','s=0.25,r=1','s=0.75,r=2',
#                's=0.75,r=0.5'])
# and continue with the following cell.

# In[6]
# First element of each row is the class (0 = dog, 1 = cat); the remaining
# four are the box coordinates.
ground_truth = nd.array([
    [0, 0.1, 0.08, 0.52, 0.92],
    [1, 0.55, 0.2, 0.9, 0.88],
])
# Five further anchor boxes.
anchors = nd.array([
    [0, 0.1, 0.2, 0.3],
    [0.15, 0.2, 0.4, 0.4],
    [0.63, 0.05, 0.88, 0.98],
    [0.66, 0.45, 0.8, 0.8],
    [0.57, 0.3, 0.92, 0.9],
])
fig = d2l.plt.imshow(img)

# Draw the ground truth in 'k' (black) to tell it apart from the five
# anchor-box colours.
show_bboxes(fig.axes, ground_truth[:, 1:] * bbox_scale,
            labels=['dog', 'cat'], colors='k')
show_bboxes(fig.axes, anchors * bbox_scale, ['0', '1', '2', '3', '4'])
d2l.plt.show()

# In[1]
from mxnet import image, contrib, gluon, nd
import numpy as np
import d2lzh as d2l

np.set_printoptions(2)

# In[2]
img = image.imread('img/catdog.jpg').asnumpy()
h, w = img.shape[0:2]
print(h, w)
X = nd.random.uniform(shape=(1, 3, h, w))
Y = contrib.nd.MultiBoxPrior(X, sizes=[0.75, 0.5, 0.25], ratios=[1, 2, 0.5])
Y.shape

# In[3]
boxes = Y.reshape((h, w, 5, 4))


# In[4]
def show_bboxes(axes, bboxes, labels=None, colors=None):
    """Draw each box in ``bboxes`` on ``axes``, labelling it when possible.

    ``labels`` and ``colors`` may each be ``None``, a single value, or a
    list/tuple. Colours default to ``['b', 'g', 'r', 'm', 'c']`` and are
    cycled; box ``i`` gets a label only when ``labels`` has more than ``i``
    entries.
    """
    def _as_list(value, fallback=None):
        # None -> fallback, list/tuple -> unchanged, anything else -> [value]
        if value is None:
            return fallback
        if isinstance(value, (list, tuple)):
            return value
        return [value]

    labels = _as_list(labels)
    colors = _as_list(colors, ['b', 'g', 'r', 'm', 'c'])
    for idx, box in enumerate(bboxes):
        edge = colors[idx % len(colors)]
        patch = d2l.bbox_to_rect(box.asnumpy(), edge)
        axes.add_patch(patch)
        if not labels or idx >= len(labels):
            continue  # no label available for this box
        # 'w' is white: black text on a white box, white text otherwise.
        font_color = 'k' if edge == 'w' else 'w'
        # Text drawn on a filled background box at the rectangle's xy point.
        axes.text(
            patch.xy[0], patch.xy[1], labels[idx],
            va='center', ha='center',
            fontsize=9, color=font_color,
            bbox=dict(facecolor=edge, lw=0),
        )


# In[5]
d2l.set_figsize()
bbox_scale = nd.array((w, h, w, h))

# In[6]
# First element of each row is the class (0 = dog, 1 = cat); the remaining
# four are the box coordinates.
ground_truth = nd.array([
    [0, 0.1, 0.08, 0.52, 0.92],
    [1, 0.55, 0.2, 0.9, 0.88],
])
# Five further anchor boxes.
anchors = nd.array([
    [0, 0.1, 0.2, 0.3],
    [0.15, 0.2, 0.4, 0.4],
    [0.63, 0.05, 0.88, 0.98],
    [0.66, 0.45, 0.8, 0.8],
    [0.57, 0.3, 0.92, 0.9],
])

# In[7]
# MultiBoxTarget labels each anchor box with a class and offsets.
# expand_dims adds an axis: the 5x4 array becomes 1x5x4.
# nd.zeros((1, 3, 5)) creates a 1x3x5 array of zeros.
# 1st argument: the input anchors, shape (1, number of anchors, 4).
# 2nd argument: the ground-truth labels of the training set, shape
#   (batch size, max number of ground-truth boxes per image, 5), where the
#   5 values are the class label plus the (normalised) coordinates.
# 3rd argument: predicted class scores for the input anchors, shape
#   (batch size, number of predicted classes + 1, number of anchors).
labels = contrib.nd.MultiBoxTarget(
    anchors.expand_dims(axis=0),
    ground_truth.expand_dims(axis=0),
    nd.zeros((1, 3, 5)),
)
# Third returned item: the class assigned to each anchor. Background is 0,
# so the original class indices are shifted up by 1.
labels[2]   # [[0. 1. 2. 0. 2.]]
# Anchor 0 has an IoU below the threshold, so it is treated as background;
# the others follow the same reasoning.
# Second returned item: the mask, shape (batch size, 4 * number of anchors),
# matching the 4 offsets of every anchor.
labels[1]   # [[0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1.]]
# First returned item: the 4 offsets labelled for every anchor; anchors of
# the negative class have offsets of 0.
labels[0]
# [[ 0.00e+00  0.00e+00  0.00e+00  0.00e+00  1.40e+00  1.00e+01  2.59e+00
#    7.18e+00 -1.20e+00  2.69e-01  1.68e+00 -1.57e+00  0.00e+00  0.00e+00
#    0.00e+00  0.00e+00 -5.71e-01 -1.00e+00 -8.94e-07  6.26e-01]]

# In[8]
# Output predicted bounding boxes; non-maximum suppression.
anchors = nd.array([
    [0.1, 0.08, 0.52, 0.92],
    [0.08, 0.2, 0.56, 0.95],
    [0.15, 0.3, 0.62, 0.91],
    [0.55, 0.2, 0.9, 0.88],
])  # anchors.size is 16
offset_preds = nd.array([0] * anchors.size)  # assume predicted offsets are 0
cls_probs = nd.array([
    [0] * 4,               # predicted probability of background
    [0.9, 0.8, 0.7, 0.1],  # predicted probability of dog
    [0.1, 0.2, 0.3, 0.9],  # predicted probability of cat
])

标签

bbox