1 2 3 4 5 6 7 %matplotlib inline %config InlineBackend.figure_format="retina" %config InlineBackend.rc = {"figure.figsize" : (7.5 ,4.5 )} %reload_ext autoreload %autoreload 2
from fastai.conv_learner import *
from fastai.dataset import *

import json, pdb
from PIL import ImageDraw, ImageFont
from matplotlib import patches, patheffects

# Select CUDA device index 1 for everything that follows.
torch.cuda.set_device(1)
参考链接:什么情况下应该设置 cudnn.benchmark = True?
大部分情况下,设置这个 flag 可以让内置的 cuDNN auto-tuner 自动寻找最适合当前配置的高效算法,从而优化运行效率。
如果网络的输入数据在维度或类型上变化不大,设置 torch.backends.cudnn.benchmark = True 可以提高运行效率;
如果网络的输入数据在每次 iteration 都变化的话,会导致 cuDNN 每次都去寻找一遍最优配置,这样反而会降低运行效率。
# Let cuDNN auto-tune its kernels (see the note above on when this helps).
torch.backends.cudnn.benchmark = True

# Dataset root and the parsed training annotation file.
PATH = Path('data/pascal2007')
# Use a context manager so the annotation file handle is closed after parsing.
with (PATH / 'pascal_train2007.json').open() as f:
    trin_json = json.load(f)

# Keys used to index the annotation JSON.
IMAGES, ANNOTATIONS, CATEGORIES = ['images', 'annotations', 'categories']
FILE_NAME, ID, IMG_ID, CAT_ID, BBOX = ['file_name', 'id', 'image_id', 'category_id', 'bbox']

JPEGS = 'VOCdevkit/VOC2007/JPEGImages'
IMG_PATH = PATH / JPEGS

# Lookup tables: category id -> name, image id -> file name, and the list of
# image ids in file order.
catgoris = {o[ID]: o['name'] for o in trin_json[CATEGORIES]}
trinFnams = {o[ID]: o[FILE_NAME] for o in trin_json[IMAGES]}
trinIDs = [o[ID] for o in trin_json[IMAGES]]
def hw_bb(bb):
    """Convert a 4-element box from (start, extent) form to corner form.

    The first two entries are swapped and the last two become inclusive end
    coordinates (``start + extent - 1``). Presumably this maps a VOC/COCO
    ``[x, y, width, height]`` box to ``[top, left, bottom, right]`` --
    confirm against the annotation format.
    """
    return np.array([bb[1], bb[0], bb[3] + bb[1] - 1, bb[2] + bb[0] - 1])


def bb_hw(bb):
    """Inverse of :func:`hw_bb`: corner form back to (start, extent) form."""
    return np.array([bb[1], bb[0], bb[3] - bb[1] + 1, bb[2] - bb[0] + 1])
def get_trn_anno():
    """Group the training annotations by image id.

    Returns a ``defaultdict`` mapping image id to a list of
    ``(bbox, category_id)`` tuples, where ``bbox`` has been converted with
    ``hw_bb``. Annotations whose ``'ignore'`` field is truthy are skipped.
    """
    trin_anno = collections.defaultdict(list)
    for o in trin_json[ANNOTATIONS]:
        if not o['ignore']:
            # hw_bb already returns an ndarray, so no extra np.array() copy.
            bb = hw_bb(o[BBOX])
            trin_anno[o[IMG_ID]].append((bb, o[CAT_ID]))
    return trin_anno


trinAnnos = get_trn_anno()
def show_img(im, figsize=None, ax=None):
    """Show ``im`` on ``ax``, creating a new figure when ``ax`` is None.

    Overlays an 8-tick grid spanning 0..224 on both axes, hides the tick
    labels, and returns the axes.
    """
    # Compare against None explicitly rather than relying on truthiness.
    if ax is None:
        fig, ax = plt.subplots(figsize=figsize)
    ax.imshow(im)
    ax.set_xticks(np.linspace(0, 224, 8))
    ax.set_yticks(np.linspace(0, 224, 8))
    ax.grid()
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    return ax


def draw_outline(o, lw):
    """Give artist ``o`` a black stroke of width ``lw`` so it stays legible."""
    o.set_path_effects([patheffects.Stroke(linewidth=lw, foreground='black'),
                        patheffects.Normal()])


def draw_rect(ax, b, color='white'):
    """Draw an unfilled, outlined rectangle; ``b[:2]`` is the anchor point
    and ``b[-2:]`` the width and height."""
    patch = ax.add_patch(patches.Rectangle(b[:2], *b[-2:], fill=False,
                                           edgecolor=color, lw=2))
    draw_outline(patch, 4)


def draw_text(ax, xy, txt, sz=14, color='white'):
    """Draw bold, outlined text ``txt`` with its top edge at ``xy``."""
    text = ax.text(*xy, txt, verticalalignment='top', color=color,
                   fontsize=sz, weight='bold')
    draw_outline(text, 1)


def draw_im(im, ann):
    """Show ``im`` and draw every ``(bbox, category_id)`` pair in ``ann``."""
    ax = show_img(im, figsize=(16, 8))
    for b, c in ann:
        b = bb_hw(b)
        draw_rect(ax, b)
        draw_text(ax, b[:2], catgoris[c], sz=16)


def draw_idx(i):
    """Load the training image with id ``i`` and draw its annotations."""
    im_a = trinAnnos[i]
    im = open_image(IMG_PATH/trinFnams[i])
    draw_im(im, im_a)
多标签分类问题 如同卫星图像分类一样,一幅图分配几个分类标签
# Data preprocessing
# CSV file that will hold the multi-label classification targets.
MC_CSV = PATH/'tmp/mc.csv'
[(array([ 96, 155, 269, 350]), 7)]
# Distinct category names present in each training image.
multiClass = [set(catgoris[p[1]] for p in trinAnnos[o]) for o in trinIDs]
# Sort each label set so the CSV is identical from run to run (set iteration
# order for strings is not stable across interpreter sessions).
multiClasses = [' '.join(sorted(str(p) for p in o)) for o in multiClass]

df = pd.DataFrame({'fn': [trinFnams[o] for o in trinIDs],
                   'clas': multiClasses}, columns=['fn', 'clas'])
# Make sure the output directory exists before writing.
MC_CSV.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(MC_CSV, index=False)
# Multi-label model and training
f_model = resnet34
sz = 224
bs = 64

tfms = tfms_from_model(f_model=f_model, sz=sz, crop_type=CropType.NO)
md = ImageClassifierData.from_csv(path=PATH, folder=JPEGS, csv_fname=MC_CSV,
                                  tfms=tfms, bs=bs)

learn = ConvLearner.pretrained(f_model, data=md)
learn.opt_fn = optim.Adam

lrf = learn.lr_find(1e-5, 100)

# NOTE(review): `lr` was never defined before its first use in this file and
# the `learn.fit(...)` / `learn.save(...)` calls were truncated; the value
# 2e-2 and the first `fit` arguments below are reconstructed -- confirm.
lr = 2e-2
learn.fit(lr, 1, cycle_len=3, use_clr=(32, 5))

# Per-layer-group learning rates for the later stage.
lrs = np.array([lr/100, lr/10, lr])

learn.lr_find(lrs/1000)
learn.sched.plot(0)

learn.fit(lrs/10, 1, cycle_len=5, use_clr=(32, 5))

learn.save('mclas')
learn.load('mclas')

y = learn.predict()
x, _ = next(iter(md.val_dl))
x = to_np(x)

# Show the first 12 validation images with every class scoring above 0.4.
fig, axes = plt.subplots(3, 4, figsize=(12, 8))
imgs = md.val_ds.denorm(x)  # denormalise the batch once, not per subplot
for i, ax in enumerate(axes.flat):
    ima = imgs[i]
    ya = np.nonzero(y[i] > 0.4)[0]
    b = '\n'.join(md.classes[o] for o in ya)
    ax = show_img(ima, ax=ax)
    draw_text(ax, (0, 0), b)
plt.tight_layout()
目标检测 按照Howard教授讲的,实现一个粗糙版本的SSD,继而进行改进。
f_model = resnet34
sz = 224
bs = 64

# Data: build the multi-class label dataset
# Category name of every annotated object, per image (duplicates kept).
multiClass = [[catgoris[p[1]] for p in trinAnnos[o]] for o in trinIDs]
# Contiguous 0-based class index <-> category name.
Id2Catgris = list(catgoris.values())
Catgris2Id = {v: k for k, v in enumerate(Id2Catgris)}

# One variable-length array of class indices per image. dtype=object is
# explicit because the rows are ragged.
multiClasses = np.array([np.array([Catgris2Id[p] for p in o]) for o in multiClass],
                        dtype=object)
array([array([6]), array([14, 12]), array([ 1, 1, 14, 14, 14]), ..., array([17, 8, 14, 14, 14]),
array([6]), array([11])], dtype=object)
# Split the per-image class arrays into validation and training parts.
val_idxs = get_cv_idxs(len(trinFnams))
((val_mcs, trn_mcs),) = split_by_idx(val_idxs, multiClasses)

# Build the bounding-box dataset
MBB_CSV = PATH/'tmp/mbb.csv'

# All boxes of an image flattened into one space-separated coordinate string.
multiBBox = [np.concatenate([p[0] for p in trinAnnos[o]]) for o in trinIDs]
multiBBoxes = [' '.join(str(p) for p in o) for o in multiBBox]

df = pd.DataFrame({'fn': [trinFnams[o] for o in trinIDs],
                   'bbox': multiBBoxes}, columns=['fn', 'bbox'])
# Make sure the output directory exists before writing.
MBB_CSV.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(MBB_CSV, index=False)
96 155 269 350
61 184 198 278 77 89 335 402
229 8 499 244 219 229 499 333 0 1 368 116 1 2 …
124 89 211 336
77 103 182 374 87 132 122 196 179 194 228 212 …
# Augmentations; tfm_y=TfmType.COORD asks each transform to treat the
# targets as coordinates.
aug_tfms = [RandomRotate(3, p=0.5, tfm_y=TfmType.COORD),
            RandomLighting(0.05, 0.05, tfm_y=TfmType.COORD),
            RandomFlip(tfm_y=TfmType.COORD)]
tfms = tfms_from_model(f_model, sz, crop_type=CropType.NO,
                       tfm_y=TfmType.COORD, aug_tfms=aug_tfms)
# continuous=True: the CSV labels are read as numeric values (box coordinates).
md = ImageClassifierData.from_csv(PATH, JPEGS, MBB_CSV, tfms=tfms, bs=bs,
                                  continuous=True, num_workers=4)
# Concatenate the class-label dataset with the bounding-box dataset
class ConcatLblDataset(Dataset):
    """Wrap dataset ``ds`` and attach a second label sequence ``y2``.

    Item ``i`` is ``(x, (y, y2[i]))`` where ``(x, y) = ds[i]``.
    """

    def __init__(self, ds, y2):
        self.ds, self.y2 = ds, y2
        # Mirror the wrapped dataset's `sz` attribute on the wrapper.
        # NOTE(review): this line was truncated to a bare `=` in the page;
        # `self.sz = ds.sz` is the reconstruction -- confirm.
        self.sz = ds.sz

    def __len__(self):
        return len(self.ds)

    def __getitem__(self, i):
        x, y = self.ds[i]
        return (x, (y, self.y2[i]))
# Swap the data loaders' datasets for wrappers that also yield class labels.
trn_ds2 = ConcatLblDataset(md.trn_ds, trn_mcs)
val_ds2 = ConcatLblDataset(md.val_ds, val_mcs)
md.trn_dl.dataset = trn_ds2
md.val_dl.dataset = val_ds2
# Inspect the newly built dataset
import matplotlib.cm as cmx
import matplotlib.colors as mcolors
from cycler import cycler


def get_cmap(N):
    """Return a function mapping a value in ``0..N-1`` to an RGBA colour
    taken from the 'Set3' colormap."""
    color_norm = mcolors.Normalize(vmin=0, vmax=N-1)
    return cmx.ScalarMappable(norm=color_norm, cmap='Set3').to_rgba


num_colr = 12
cmap = get_cmap(num_colr)
colr_list = [cmap(float(x)) for x in range(num_colr)]
def show_ground_truth(ax, im, bbox, clas=None, prs=None, thresh=0.3):
    """Draw boxes, and optionally class names and scores, over ``im``.

    ``bbox`` is reshaped to rows of four values and converted with ``bb_hw``.
    A box is drawn only if its third converted value exceeds 1 and its score
    (when ``prs`` is given) exceeds ``thresh``. A class index equal to
    ``len(Id2Catgris)`` is labelled ``'bg'``.
    """
    bb = [bb_hw(o) for o in bbox.reshape(-1, 4)]
    if prs is None:
        prs = [None] * len(bb)
    if clas is None:
        clas = [None] * len(bb)
    ax = show_img(im, ax=ax)
    for i, (b, c, pr) in enumerate(zip(bb, clas, prs)):
        if (b[2] > 1) and (pr is None or pr > thresh):
            colr = colr_list[i % num_colr]  # cycle through the palette
            draw_rect(ax, b, color=colr)
            txt = f'{i}: '
            if c is not None:
                txt += ('bg' if c == len(Id2Catgris) else Id2Catgris[c])
            if pr is not None:
                txt += f' {pr:.2f}'
            draw_text(ax, b[:2], txt, color=colr)
# Take one validation batch and undo the input normalisation for display.
x, y = to_np(next(iter(md.val_dl)))
x = md.val_ds.ds.denorm(x)

# y[0] holds the boxes and y[1] the class ids (order set by ConcatLblDataset).
fig, axes = plt.subplots(3, 4, figsize=(16, 12))
for i, ax in enumerate(axes.flat):
    show_ground_truth(ax, x[i], y[0][i], y[1][i])
plt.tight_layout()
网络结构 先构建简单的模型,后继续改进。先用4x4 anchor。
SSD的方式是在原有ResNet主干后增加一个stride为2的卷积层,得到4x4的tensor,对其中每个grid cell做(4+C)的目标检测,共 $4 \times 4 \times (4+C)$ 个输出;
anchor设置 参数介绍:
anc_grid = how big of a square grid to make (subdivision)
anc_offset = center offsets
anc_x = x coordinates for centers
anc_y = y coordinates for centers
anc_ctrs - the actual coordinates for the grid centers
anc_sizes - size of the quadrants
anc_grid = 4   # anchors form an anc_grid x anc_grid grid
k = 1          # anchors per grid cell

# Cell centres are offset by half a cell from the image border.
anc_offset = 1/(anc_grid*2)
anc_x = np.repeat(np.linspace(anc_offset, 1-anc_offset, anc_grid), anc_grid)
anc_y = np.tile(np.linspace(anc_offset, 1-anc_offset, anc_grid), anc_grid)

anc_ctrs = np.tile(np.stack([anc_x, anc_y], axis=1), (k, 1))
anc_sizes = np.array([[1/anc_grid, 1/anc_grid] for i in range(anc_grid*anc_grid)])
# Each anchor row is (centre_0, centre_1, size_0, size_1) in 0..1 coordinates.
anchors = V(np.concatenate([anc_ctrs, anc_sizes], axis=1), requires_grad=False).float()

grid_sizes = V(np.array([1/anc_grid]), requires_grad=False).unsqueeze(1)

# Plot the anchor centres.
plt.grid(False)
plt.scatter(anc_x, anc_y)
plt.xlim(0, 1)
plt.ylim(0, 1);
def hw2corners(ctr, hw):
    """Convert (centre, size) boxes to corner form.

    Returns ``[ctr - hw/2, ctr + hw/2]`` concatenated along dim 1, i.e. one
    row of ``(min_0, min_1, max_0, max_1)`` per box.
    """
    return torch.cat([ctr - hw/2, ctr + hw/2], dim=1)
# Anchor boxes in corner form, used for the IoU computations.
anchor_cnr = hw2corners(anchors[:, :2], anchors[:, 2:])
anchor_cnr
自定义高级网络层 以ResNet34为主干,追加更多的卷积层。现在是一层卷积层,用于4x4grid。
# One extra class for background; every anchor predicts 4 box values plus
# one score per class.
n_clas = len(Id2Catgris) + 1
n_act = k*(4 + n_clas)
关于flatten的方式(参见 PyTorch 中的 contiguous()):
view只能用在contiguous的variable上。如果在view之前用了transpose, permute等,需要用contiguous()来返回一个contiguous copy。
class StdConv(nn.Module):
    """3x3 convolution followed by ReLU, batch norm and dropout."""

    def __init__(self, nin, nout, stride=2, drop=0.1):
        super().__init__()
        self.conv = nn.Conv2d(nin, nout, 3, stride=stride, padding=1)
        self.bn = nn.BatchNorm2d(nout)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        # NOTE(review): the forward body was truncated in the page after
        # `self.drop(`; conv -> ReLU -> batch norm -> dropout is the
        # reconstruction -- confirm the ordering.
        return self.drop(self.bn(F.relu(self.conv(x))))


def flatten_conv(x, k):
    """Reshape a ``(bs, nf, gx, gy)`` activation to ``(bs, gx*gy*k, nf//k)``.

    The channel axis is moved last first; ``contiguous()`` is required
    because ``view`` cannot be applied to a permuted tensor.
    """
    bs, nf, gx, gy = x.size()
    x = x.permute(0, 2, 3, 1).contiguous()
    return x.view(bs, -1, nf//k)
class OutConv(nn.Module):
    """Output head with two parallel 3x3 convolutions.

    ``oconv1`` emits ``(len(Id2Catgris)+1) * k`` class scores per cell and
    ``oconv2`` emits ``4 * k`` box values per cell. ``forward`` returns both,
    flattened with ``flatten_conv``, as ``[class_scores, boxes]``.
    """

    def __init__(self, k, nin, bias):
        super().__init__()
        self.k = k
        self.oconv1 = nn.Conv2d(nin, (len(Id2Catgris)+1)*k, 3, padding=1)
        self.oconv2 = nn.Conv2d(nin, 4*k, 3, padding=1)
        # The `bias` argument was otherwise unused: start the classification
        # bias at `bias` (callers pass a negative value).
        # NOTE(review): reconstructed line -- confirm against the original
        # notebook.
        self.oconv1.bias.data.zero_().add_(bias)

    def forward(self, x):
        return [flatten_conv(self.oconv1(x), self.k),
                flatten_conv(self.oconv2(x), self.k)]
class SSD_Head(nn.Module):
    """Single-scale detection head placed on top of the backbone.

    Applies ReLU + dropout, a stride-1 ``StdConv`` (512 -> 256), a default
    stride-2 ``StdConv`` (256 -> 256), and finally ``OutConv``.
    """

    def __init__(self, k, bias):
        super().__init__()
        self.drop = nn.Dropout(0.25)
        self.sconv0 = StdConv(512, 256, stride=1)
        self.sconv2 = StdConv(256, 256)
        self.out = OutConv(k, 256, bias)

    def forward(self, x):
        x = self.drop(F.relu(x))
        x = self.sconv0(x)
        x = self.sconv2(x)
        return self.out(x)
# Attach the custom head to the backbone and wrap it in a learner.
head_reg4 = SSD_Head(k, -3.)
models = ConvnetBuilder(f_model, 0, 0, 0, custom_head=head_reg4)
learn = ConvLearner(md, models)
learn.opt_fn = optim.Adam
损失函数 损失函数需要先对图像中的目标对应到最后卷积层的其中一个grid中,就可以说是“这个grid是对这一个目标对标的”;
将预测得到的activations转换至BBox(anchor 空间)
将Ground Truth的BBox映射到Anchor 空间
检查是否存在覆盖区域大于0.4的(Ground Truth BBox与4x4的anchors)
# Multi-class classification loss
def one_hot_embedding(labels, num_classes):
    """Return one-hot rows (taken from an identity matrix) for ``labels``."""
    # NOTE(review): the index expression was stripped from the page;
    # `labels.data.cpu()` is the reconstruction -- confirm.
    return torch.eye(num_classes)[labels.data.cpu()]


class BCE_Loss(nn.Module):
    """Summed binary cross-entropy over the non-background classes.

    Targets are one-hot encoded with one extra column whose entry is then
    dropped, so a target equal to ``num_classes`` becomes an all-zero row.
    The last prediction column is dropped as well. The summed loss is
    divided by ``num_classes``.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.num_classes = num_classes

    def forward(self, pred, targ):
        t = one_hot_embedding(targ, self.num_classes+1)
        t = V(t[:, :-1].contiguous())
        x = pred[:, :-1]
        w = self.get_weight(x, t)
        return F.binary_cross_entropy_with_logits(x, t, w, size_average=False)/self.num_classes

    def get_weight(self, x, t):
        """Per-element loss weights; ``None`` means unweighted. Subclasses
        override this."""
        return None
# Classification loss used by ssd_1_loss.
loss_f = BCE_Loss(len(Id2Catgris))
# IoU computation (used by the localization loss)
def intersect(box_a, box_b):
    """Pairwise intersection areas of corner-form boxes.

    ``box_a`` is ``(A, 4)`` and ``box_b`` is ``(B, 4)``; the result is
    ``(A, B)``. Non-overlapping pairs give 0 thanks to the clamp.
    """
    max_xy = torch.min(box_a[:, None, 2:], box_b[None, :, 2:])
    min_xy = torch.max(box_a[:, None, :2], box_b[None, :, :2])
    inter = torch.clamp((max_xy - min_xy), min=0)
    return inter[:, :, 0] * inter[:, :, 1]


def box_sz(b):
    """Area of each corner-form box in ``b``."""
    return ((b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]))


def jaccard(box_a, box_b):
    """Pairwise IoU (intersection over union) with shape ``(A, B)``."""
    inter = intersect(box_a, box_b)
    union = box_sz(box_a).unsqueeze(1) + box_sz(box_b).unsqueeze(0) - inter
    return inter / union
def get_y(bbox, clas):
    """Scale target boxes by ``1/sz`` and drop rows with no extent.

    Rows where ``bbox[:, 2] - bbox[:, 0]`` is not positive are removed
    (presumably zero padding added when batching -- confirm), together with
    their class entries.
    """
    bbox = bbox.view(-1, 4)/sz
    bb_keep = ((bbox[:, 2] - bbox[:, 0]) > 0).nonzero()[:, 0]
    return bbox[bb_keep], clas[bb_keep]


def actn_to_bb(actn, anchors):
    """Turn raw box activations into corner-form boxes relative to anchors.

    ``tanh`` bounds the activations to (-1, 1): a centre can move by at most
    half a grid cell and a size can range from 0.5x to 1.5x of the anchor.
    """
    actn_bbs = torch.tanh(actn)
    actn_centers = (actn_bbs[:, :2]/2 * grid_sizes) + anchors[:, :2]
    actn_hw = (actn_bbs[:, 2:]/2 + 1) * anchors[:, 2:]
    return hw2corners(actn_centers, actn_hw)


def map_to_ground_truth(overlaps, print_it=False):
    """Assign one ground-truth object to every anchor.

    ``overlaps`` is a ``(num_objects, num_anchors)`` IoU matrix. Each anchor
    first takes its best-overlapping object; then every object's single best
    anchor is forced onto that object and given overlap 1.99 so that it
    passes any later threshold.

    Returns ``(gt_overlap, gt_idx)``, both of length ``num_anchors``.
    """
    prior_overlap, prior_idx = overlaps.max(1)
    if print_it:
        print(prior_overlap)
    gt_overlap, gt_idx = overlaps.max(0)
    gt_overlap[prior_idx] = 1.99
    for i, o in enumerate(prior_idx):
        gt_idx[o] = i
    return gt_overlap, gt_idx


def ssd_1_loss(b_c, b_bb, bbox, clas, print_it=False):
    """Localization and classification loss for a single image.

    Anchors whose matched overlap exceeds 0.4 are positives; all others are
    assigned the background class ``len(Id2Catgris)``. The localization loss
    is the mean absolute corner difference over positive anchors.
    """
    bbox, clas = get_y(bbox, clas)
    a_ic = actn_to_bb(b_bb, anchors)
    # NOTE(review): the jaccard arguments were stripped from the page;
    # `bbox.data, anchor_cnr.data` is the reconstruction -- confirm.
    overlaps = jaccard(bbox.data, anchor_cnr.data)
    gt_overlap, gt_idx = map_to_ground_truth(overlaps, print_it)
    gt_clas = clas[gt_idx]
    pos = gt_overlap > 0.4
    pos_idx = torch.nonzero(pos)[:, 0]
    gt_clas[1 - pos] = len(Id2Catgris)
    gt_bbox = bbox[gt_idx]
    loc_loss = ((a_ic[pos_idx] - gt_bbox[pos_idx]).abs()).mean()
    clas_loss = loss_f(b_c, gt_clas)
    return loc_loss, clas_loss
def ssd_loss(pred, targ, print_it=False):
    """Sum ``ssd_1_loss`` over every image in the batch.

    ``pred`` unpacks to (class activations, box activations) and ``targ`` to
    (boxes, classes). Returns localization loss plus classification loss.
    """
    lcs, lls = 0., 0.
    for b_c, b_bb, bbox, clas in zip(*pred, *targ):
        loc_loss, clas_loss = ssd_1_loss(b_c, b_bb, bbox, clas, print_it)
        lls += loc_loss
        lcs += clas_loss
    if print_it:
        # NOTE(review): the interpolated expressions were stripped from the
        # page; `lls.data[0]` / `lcs.data[0]` are reconstructed -- confirm.
        print(f'loc: {lls.data[0]}, clas: {lcs.data[0]}')
    return lls + lcs
Loss测试 确保loss函数可行
x, y = next(iter(md.val_dl))
x, y = V(x), V(y)

# Move the targets and the model to the GPU.
for i, o in enumerate(y):
    y[i] = o.cuda()
learn.model.cuda()

# `batch` was used below without ever being assigned; run the model first.
batch = learn.model(x)
ssd_loss(batch, y, True)
# Train the model
learn.crit = ssd_loss
lr = 3e-3
lrs = np.array([lr/100, lr/10, lr])

learn.lr_find(lrs/1000, 1.)
learn.sched.plot(1)

# NOTE(review): the start of this call (`learn.fit(<lr>`) was stripped from
# the page; `lr` as the first argument is reconstructed -- confirm.
learn.fit(lr, 1, cycle_len=5, use_clr=(20, 10))

# NOTE(review): `learn.save(` was stripped from the page; reconstructed.
learn.save('0')
learn.load('0')
# Test the model: direct prediction
x, y = next(iter(md.val_dl))
x, y = V(x), V(y)
learn.model.eval()
batch = learn.model(x)
b_clas, b_bb = batch

b_clas.size(), b_bb.size()

# Pick a single image of the batch to inspect in detail.
idx = 7
b_clasi = b_clas[idx]
b_bboxi = b_bb[idx]
ima = md.val_ds.ds.denorm(to_np(x))[idx]
bbox, clas = get_y(y[0][idx], y[1][idx])
bbox, clas
def torch_gt(ax, ima, bbox, clas, prs=None, thresh=0.4):
    """Tensor-friendly wrapper around ``show_ground_truth``.

    Scales the boxes back to pixels and converts every tensor argument to
    numpy. The scale factor is the global image size ``sz`` (the same value
    ``get_y`` divides by) instead of a hard-coded 224.
    """
    return show_ground_truth(ax, ima, to_np((bbox*sz).long()),
                             to_np(clas),
                             to_np(prs) if prs is not None else None,
                             thresh)
# Ground truth of the selected image.
fig, ax = plt.subplots(figsize=(7, 7))
torch_gt(ax, ima, bbox, clas)

# The fixed anchors, labelled with the predicted class for each anchor.
fig, ax = plt.subplots(figsize=(7, 7))
torch_gt(ax, ima, anchor_cnr, b_clasi.max(1)[1])

a_ic = actn_to_bb(b_bboxi, anchors)

# Predicted boxes with their class and score.
fig, ax = plt.subplots(figsize=(7, 7))
torch_gt(ax, ima, a_ic, b_clasi.max(1)[1], b_clasi.max(1)[0].sigmoid(), thresh=0.0)

# NOTE(review): the jaccard arguments were stripped from the page;
# `bbox.data, anchor_cnr.data` is the reconstruction -- confirm.
overlaps = jaccard(bbox.data, anchor_cnr.data)
overlaps

gt_overlap, gt_idx = map_to_ground_truth(overlaps)
gt_overlap, gt_idx

gt_clas = clas[gt_idx]; gt_clas

thresh = 0.5
pos = gt_overlap > thresh
pos_idx = torch.nonzero(pos)[:, 0]
neg_idx = torch.nonzero(1 - pos)[:, 0]
pos_idx

# Anchors that are not positive are labelled as background.
gt_clas[1 - pos] = len(Id2Catgris)
# NOTE(review): the iterable was stripped from the page; `gt_clas.data` is
# the reconstruction -- confirm.
[Id2Catgris[o] if o < len(Id2Catgris) else 'bg' for o in gt_clas.data]

gt_bbox = bbox[gt_idx]
loc_loss = ((a_ic[pos_idx] - gt_bbox[pos_idx]).abs()).mean()
clas_loss = F.cross_entropy(b_clasi, gt_clas)
loc_loss, clas_loss
fig, axes = plt.subplots(3, 4, figsize=(16, 12))
imgs = md.val_ds.ds.denorm(to_np(x))  # denormalise the batch once
for idx, ax in enumerate(axes.flat):
    ima = imgs[idx]
    # The original repeated the `ima` line and called get_y a second time on
    # its own output (dividing the boxes by `sz` twice); one call is enough.
    bbox, clas = get_y(y[0][idx], y[1][idx])
    a_ic = actn_to_bb(b_bb[idx], anchors)
    torch_gt(ax, ima, a_ic, b_clas[idx].max(1)[1],
             b_clas[idx].max(1)[0].sigmoid(), 0.01)
plt.tight_layout()
进一步的改进:更多的anchor box 提升的途径:
不同大小的Anchor Boxes -> anc_zooms
不同长宽比的Anchor Boxes -> anc_ratios
使用更多的卷积层来产生Anchor Boxes -> anc_grids
# Create more anchors
anc_grids = [4, 2, 1]                              # grid resolutions
anc_zooms = [0.7, 1., 1.3]                         # scale factors
anc_ratios = [(1., 1.), (1., 0.5), (0.5, 1.)]      # aspect ratios

# Every zoom/ratio combination; k is the number of anchors per grid cell.
anchor_scales = [(anz*i, anz*j) for anz in anc_zooms for (i, j) in anc_ratios]
k = len(anchor_scales)
anc_offsets = [1/(o*2) for o in anc_grids]
k

# Cell centres for every grid, each repeated k times (one per anchor shape).
anc_x = np.concatenate([np.repeat(np.linspace(ao, 1-ao, ag), ag)
                        for ao, ag in zip(anc_offsets, anc_grids)])
anc_y = np.concatenate([np.tile(np.linspace(ao, 1-ao, ag), ag)
                        for ao, ag in zip(anc_offsets, anc_grids)])
anc_ctrs = np.repeat(np.stack([anc_x, anc_y], axis=1), k, axis=0)

anc_sizes = np.concatenate([np.array([[o/ag, p/ag] for i in range(ag*ag) for o, p in anchor_scales])
                            for ag in anc_grids])
grid_sizes = V(np.concatenate([np.array([1/ag for i in range(ag*ag) for o, p in anchor_scales])
                               for ag in anc_grids]), requires_grad=False).unsqueeze(1)
anchors = V(np.concatenate([anc_ctrs, anc_sizes], axis=1), requires_grad=False).float()
anchor_cnr = hw2corners(anchors[:, :2], anchors[:, 2:])

x, y = to_np(next(iter(md.val_dl)))
x = md.val_ds.ds.denorm(x)

# Jitter the anchors slightly so that coincident boxes remain distinguishable.
a = np.reshape((to_np(anchor_cnr) + to_np(torch.randn(*anchor_cnr.size()))*0.01)*224, -1)

fig, ax = plt.subplots(figsize=(7, 7))
show_ground_truth(ax, x[0], a)
activations与Ground Truth进行对比,计算loss,然后就是求导更新权重;
【匹配问题】定义的损失函数loss function能够比较activation和ground truth,计算得到的值用于衡量activation的好坏。为此,对于ground truth中的每个Object,需要决定一组(4+C)的activation与之对比。
因为使用的是SSD的方式,所以匹配的anchor box并不是随意的:用于匹配的activation,其感受野(receptive field)需要与ground truth中的Object所处的位置具有最大的重叠。
grid cell可以是不同大小的(anc_grid),grid cell中可以有不同长宽比和缩放比的anchor box;
anchor box的数目即对应了卷积层activation的组数目,但并不表示每个卷积层需要那么多组activation,因为4x4的卷积层有16组,2x2的卷积层有4组,1x1的卷积层有1组,这样就有了1+4+16组。
接下来只需要知道参数k,k表示的是长宽比和缩放比的组合数。这样,1xk+4xk+16xk,就是全部的anchor box数目了。
drop = 0.4


class SSD_MultiHead(nn.Module):
    """Multi-scale detection head.

    After ``sconv0`` (stride 1), three default stride-2 ``StdConv`` layers
    are applied in sequence and an ``OutConv`` is attached after each one.
    The three outputs are concatenated along dim 1 and returned as
    ``[class_scores, boxes]``.
    """

    def __init__(self, k, bias):
        super().__init__()
        self.drop = nn.Dropout(drop)
        self.sconv0 = StdConv(512, 256, stride=1, drop=drop)
        self.sconv1 = StdConv(256, 256, drop=drop)
        self.sconv2 = StdConv(256, 256, drop=drop)
        self.sconv3 = StdConv(256, 256, drop=drop)
        # out0 is never used in forward(); it is kept so that the module's
        # parameter set (and any saved weights) stays unchanged.
        self.out0 = OutConv(k, 256, bias)
        self.out1 = OutConv(k, 256, bias)
        self.out2 = OutConv(k, 256, bias)
        self.out3 = OutConv(k, 256, bias)

    def forward(self, x):
        x = self.drop(F.relu(x))
        x = self.sconv0(x)
        x = self.sconv1(x)
        o1c, o1l = self.out1(x)
        x = self.sconv2(x)
        o2c, o2l = self.out2(x)
        x = self.sconv3(x)
        o3c, o3l = self.out3(x)
        # NOTE(review): `torch.cat(` was stripped from the page; reconstructed.
        return [torch.cat([o1c, o2c, o3c], dim=1),
                torch.cat([o1l, o2l, o3l], dim=1)]


head_reg4 = SSD_MultiHead(k, -4.)
models = ConvnetBuilder(f_model, 0, 0, 0, custom_head=head_reg4)
learn = ConvLearner(md, models)
learn.opt_fn = optim.Adam
# Training and testing
learn.crit = ssd_loss
lr = 1e-2
lrs = np.array([lr/100, lr/10, lr])

x, y = next(iter(md.val_dl))
x, y = V(x), V(y)
batch = learn.model(x)  # x is already wrapped; no second V() needed

learn.lr_find(lrs/1000, 1.)
learn.sched.plot(n_skip_end=2)

learn.freeze_to(-2)
# NOTE(review): the start of this call was stripped from the page;
# `learn.fit(lrs/2, ...` is the reconstruction -- confirm.
learn.fit(lrs/2, 1, cycle_len=4, use_clr=(20, 8))

x, y = next(iter(md.val_dl))
y = V(y)
batch = learn.model(V(x))
b_clas, b_bb = batch
x = to_np(x)

fig, axes = plt.subplots(3, 4, figsize=(16, 12))
imgs = md.val_ds.ds.denorm(x)  # denormalise the batch once
for idx, ax in enumerate(axes.flat):
    ima = imgs[idx]
    bbox, clas = get_y(y[0][idx], y[1][idx])
    a_ic = actn_to_bb(b_bb[idx], anchors)
    torch_gt(ax, ima, a_ic, b_clas[idx].max(1)[1],
             b_clas[idx].max(1)[0].sigmoid(), 0.21)
plt.tight_layout()
Focal Loss 论文《Focal Loss for Dense Object Detection》
Focal Loss 就是一个解决分类问题中类别不平衡、分类难度差异的一个 loss。
相关博文:《何恺明大神的「Focal Loss」,如何更好地理解?》
# Define Focal Loss
class FocalLoss(BCE_Loss):
    """``BCE_Loss`` with focal weighting (alpha = 0.25, gamma = 1)."""

    def get_weight(self, x, t):
        """Return per-element weights ``w * (1 - pt) ** gamma``.

        ``pt`` is the probability assigned to the true label and ``w`` is
        ``alpha`` for positive targets, ``1 - alpha`` for negative ones.
        """
        alpha, gamma = 0.25, 1
        p = x.sigmoid()
        pt = p*t + (1 - p)*(1 - t)
        w = alpha*t + (1 - alpha)*(1 - t)
        return w * (1 - pt).pow(gamma)


loss_f = FocalLoss(len(Id2Catgris))
x, y = next(iter(md.val_dl))
x, y = V(x), V(y)
batch = learn.model(x)
ssd_loss(batch, y, True)

# Training and testing
learn.lr_find(lrs/1000, 1.)
learn.sched.plot(n_skip_end=1)

# NOTE(review): the `learn.fit(<rates>` / `learn.save(` prefixes of the next
# calls were stripped from the page; the first arguments (`lrs`, `lrs/4`)
# are reconstructed -- confirm.
learn.fit(lrs, 1, cycle_len=10, use_clr=(20, 10))

learn.save('fl0')
learn.load('fl0')

learn.freeze_to(-2)
learn.fit(lrs/4, 1, cycle_len=10, use_clr=(20, 10))

learn.save('drop4')
learn.load('drop4')
def plot_results(thresh):
    """Plot predictions for the first 12 images of a validation batch.

    For each image, only boxes scoring above ``thresh`` times that image's
    highest score are drawn.
    """
    x, y = next(iter(md.val_dl))
    batch = learn.model(V(x))
    b_clas, b_bb = batch

    imgs = md.val_ds.ds.denorm(to_np(x))  # denormalise the batch once
    fig, axes = plt.subplots(3, 4, figsize=(16, 12))
    for idx, ax in enumerate(axes.flat):
        ima = imgs[idx]
        a_ic = actn_to_bb(b_bb[idx], anchors)
        clas_pr, clas_ids = b_clas[idx].max(1)
        clas_pr = clas_pr.sigmoid()
        torch_gt(ax, ima, a_ic, clas_ids, clas_pr, clas_pr.max().data[0]*thresh)
    plt.tight_layout()
非极大抑制(NMS) 按照Howard的说法,NMS好理解,但比较繁琐,他也是直接摘的网上的一段代码。
def nms(boxes, scores, overlap=0.5, top_k=100):
    """Greedy non-maximum suppression.

    Repeatedly keeps the highest-scoring remaining box and discards every
    other box whose IoU with it exceeds ``overlap``. Only the ``top_k``
    highest-scoring boxes are considered.

    Args:
        boxes: ``(N, 4)`` tensor of corner-form boxes.
        scores: ``(N,)`` tensor of confidences.
        overlap: IoU threshold above which a box is suppressed.
        top_k: maximum number of candidates to examine.

    Returns:
        ``(keep, count)``: ``keep[:count]`` holds the indices of the kept
        boxes in descending score order. Empty input returns ``(keep, 0)``
        so callers can always unpack two values.
    """
    keep = scores.new(scores.size(0)).zero_().long()
    if boxes.numel() == 0:
        return keep, 0
    x1, y1 = boxes[:, 0], boxes[:, 1]
    x2, y2 = boxes[:, 2], boxes[:, 3]
    area = torch.mul(x2 - x1, y2 - y1)
    _, idx = scores.sort(0)   # ascending: best candidate is last
    idx = idx[-top_k:]

    count = 0
    while idx.numel() > 0:
        i = idx[-1]           # best remaining box
        keep[count] = i
        count += 1
        if idx.size(0) == 1:
            break
        idx = idx[:-1]
        # Intersection of box i with every remaining candidate.
        xx1 = torch.clamp(torch.index_select(x1, 0, idx), min=x1[i])
        yy1 = torch.clamp(torch.index_select(y1, 0, idx), min=y1[i])
        xx2 = torch.clamp(torch.index_select(x2, 0, idx), max=x2[i])
        yy2 = torch.clamp(torch.index_select(y2, 0, idx), max=y2[i])
        w = torch.clamp(xx2 - xx1, min=0.0)
        h = torch.clamp(yy2 - yy1, min=0.0)
        inter = w * h
        rem_areas = torch.index_select(area, 0, idx)
        union = (rem_areas - inter) + area[i]
        IoU = inter / union
        # Keep only candidates that do not overlap box i too much.
        idx = idx[IoU.le(overlap)]
    return keep, count
# One validation batch and its predictions, shared by show_nmf below.
x, y = next(iter(md.val_dl))
y = V(y)
batch = learn.model(V(x))
b_clas, b_bb = batch
x = to_np(x)
def show_nmf(idx):
    """Plot the predictions for image ``idx`` of the current batch after
    per-class non-maximum suppression.

    For every class except the last (background) column, anchors scoring
    above 0.25 are passed through ``nms`` (IoU 0.4, top 50); the survivors
    of all classes are drawn together.
    """
    ima = md.val_ds.ds.denorm(x)[idx]
    a_ic = actn_to_bb(b_bb[idx], anchors)

    # Rows are classes, columns are anchors.
    conf_scores = b_clas[idx].sigmoid().t().data

    out1, out2, cc = [], [], []
    for cl in range(0, len(conf_scores) - 1):
        c_mask = conf_scores[cl] > 0.25
        if c_mask.sum() == 0:
            continue
        scores = conf_scores[cl][c_mask]
        l_mask = c_mask.unsqueeze(1).expand_as(a_ic)
        boxes = a_ic[l_mask].view(-1, 4)
        # NOTE(review): `boxes.data` here and below, and the two `torch.cat`
        # calls, were stripped from the page and are reconstructed -- confirm.
        ids, count = nms(boxes.data, scores, 0.4, 50)
        ids = ids[:count]
        out1.append(scores[ids])
        out2.append(boxes.data[ids])
        cc.append([cl] * count)
    if not cc:
        # Report this function's own argument, not a stray global `i`.
        print(f"{idx}: empty array")
        return
    cc = T(np.concatenate(cc))
    out1 = torch.cat(out1)
    out2 = torch.cat(out2)

    fig, ax = plt.subplots(figsize=(8, 8))
    torch_gt(ax, ima, out2, cc, out1, 0.1)
# Show the NMS-filtered predictions for the first 12 images of the batch.
for i in range(12):
    show_nmf(i)
End 效果还是有待提升,先去看SSD 论文了。
SSD: Single Shot MultiBox Detector