【python / mxnet / gluoncv / jupyter notebook】Re-identification of the Same Pedestrian Across Changing Scenes
The program was run on a high-performance cluster:
CPU: Intel Xeon Gold 6140 Processor * 2 (36 cores in total)
Memory: 512 GB RAM
GPU: Tesla P100-PCIE-16GB * 2
The dataset and source code are available here:
tutorials: https://github.com/wnm1503303791/pycode/tree/master/gluoncv/re-id/baseline
In [ ]:
#market1501.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function, division

import json, os
from os import path as osp
from zipfile import ZipFile

from gluoncv.utils import download


def extract(fpath, exdir):
    print("Extracting zip file")
    with ZipFile(fpath) as z:
        z.extractall(path=exdir)
    print("Extracting Done")


def make_list(exdir):
    train_dir = osp.join(exdir, "bounding_box_train")
    train_list = {}
    for _, _, files in os.walk(train_dir, topdown=False):
        for name in files:
            if '.jpg' in name:
                name_split = name.split('_')
                pid = name_split[0]
                pcam = name_split[1][1]
                if pid not in train_list:
                    train_list[pid] = []
                train_list[pid].append({"name": name, "pid": pid, "pcam": pcam})

    with open(osp.join(exdir, 'train.txt'), 'w') as f:
        for i, key in enumerate(train_list):
            for item in train_list[key]:
                f.write(item['name'] + " " + str(i) + " " + item["pcam"] + "\n")
    print("Make Label List Done")


def main():
    name = "Market-1501-v15.09.15"
    url = "http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/" + name + ".zip"
    root = osp.expanduser("~/.mxnet/datasets")
    if not os.path.exists(root):
        os.mkdir(root)

    fpath = osp.join(root, name + '.zip')
    exdir = osp.join(root, name)

    if os.path.exists(fpath):
        if not osp.isdir(exdir):
            extract(fpath, root)
            make_list(exdir)
    else:
        download(url, fpath, False)
        extract(fpath, root)
        make_list(exdir)


if __name__ == '__main__':
    main()
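make_list() relies on the Market-1501 naming convention: each image is named <pid>_c<cam>s<seq>_<frame>_<bbox>.jpg, so the field before the first underscore is the person identity and the second character of the next field is the camera index. The short sketch below is not part of the original scripts, and the filename is only an illustrative example of that pattern; it just traces the parsing that make_list() performs.

# Minimal sketch of the filename parsing used by make_list().
# Illustrative Market-1501-style filename, not taken from the dataset itself.
name = "0002_c1s1_000451_03.jpg"

name_split = name.split('_')
pid = name_split[0]        # person identity, e.g. "0002"
pcam = name_split[1][1]    # camera id: second character of "c1s1" -> "1"

print(pid, pcam)           # -> 0002 1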
In [5]:
! python market1501.py
In [ ]:
#train.py
from __future__ import division

import argparse, datetime, os
import logging
logging.basicConfig(level=logging.INFO)

import mxnet as mx
from mxnet import gluon, nd
from mxnet.gluon.model_zoo import vision as models
from mxnet.gluon.data.vision import transforms
from mxnet import autograd

from networks import resnet18, resnet34, resnet50
from gluoncv.data.market1501.data_read import ImageTxtDataset
from gluoncv.data.market1501.label_read import LabelList
from gluoncv.data.transforms.block import RandomCrop

# CLI
parser = argparse.ArgumentParser(description='Train a model for image classification.')
parser.add_argument('--img-height', type=int, default=384,
                    help='the height of image for input')
parser.add_argument('--img-width', type=int, default=128,
                    help='the width of image for input')
parser.add_argument('--batch-size', type=int, default=32,
                    help='training batch size per device (CPU/GPU).')
parser.add_argument('--num-workers', type=int, default=8,
                    help='the number of workers for data loader')
parser.add_argument('--dataset-root', type=str, default="~/.mxnet/datasets",
                    help='the root directory of the dataset')
parser.add_argument('--dataset', type=str, default="market1501",
                    help='the name of the dataset')
parser.add_argument('--num-gpus', type=int, default=1,
                    help='number of gpus to use.')
parser.add_argument('--warmup', type=bool, default=True,
                    help='whether to warm up the learning rate.')
parser.add_argument('--epochs', type=str, default="5,25,50,75")
parser.add_argument('--ratio', type=float, default=1.,
                    help="ratio of training set to all set")
parser.add_argument('--pad', type=int, default=10)
parser.add_argument('--lr', type=float, default=3.5e-4,
                    help='learning rate. default is 3.5e-4.')
parser.add_argument('--momentum', type=float, default=0.9,
                    help='momentum value for optimizer, default is 0.9.')
parser.add_argument('--wd', type=float, default=5e-4,
                    help='weight decay rate. default is 5e-4.')
parser.add_argument('--seed', type=int, default=613,
                    help='random seed to use. Default=613.')
parser.add_argument('--lr-decay', type=float, default=0.1)
parser.add_argument('--hybridize', type=bool, default=True)


def get_data_iters(batch_size):
    train_set, val_set = LabelList(ratio=opt.ratio, root=opt.dataset_root, name=opt.dataset)

    normalizer = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    transform_train = transforms.Compose([
        transforms.Resize(size=(opt.img_width, opt.img_height), interpolation=1),
        transforms.RandomFlipLeftRight(),
        RandomCrop(size=(opt.img_width, opt.img_height), pad=opt.pad),
        transforms.ToTensor(),
        normalizer])

    train_imgs = ImageTxtDataset(train_set, transform=transform_train)
    train_data = gluon.data.DataLoader(train_imgs, batch_size, shuffle=True,
                                       last_batch='discard', num_workers=opt.num_workers)

    if opt.ratio < 1:
        transform_test = transforms.Compose([
            transforms.Resize(size=(opt.img_width, opt.img_height), interpolation=1),
            transforms.ToTensor(),
            normalizer])

        val_imgs = ImageTxtDataset(val_set, transform=transform_test)
        val_data = gluon.data.DataLoader(val_imgs, batch_size, shuffle=True,
                                         last_batch='discard', num_workers=opt.num_workers)
    else:
        val_data = None

    return train_data, val_data


def validate(val_data, net, criterion, ctx):
    loss = 0.0
    for data, label in val_data:
        data_list = gluon.utils.split_and_load(data, ctx)
        label_list = gluon.utils.split_and_load(label, ctx)

        with autograd.predict_mode():
            outputs = [net(X) for X in data_list]
            losses = [criterion(X, y) for X, y in zip(outputs, label_list)]

        accuracy = [(X.argmax(axis=1) == y.astype('float32')).mean().asscalar()
                    for X, y in zip(outputs, label_list)]
        loss_list = [l.mean().asscalar() for l in losses]
        loss += sum(loss_list) / len(loss_list)

    return loss / len(val_data), sum(accuracy) / len(accuracy)


def main(net, batch_size, epochs, opt, ctx):
    train_data, val_data = get_data_iters(batch_size)
    if opt.hybridize:
        net.hybridize()

    trainer = gluon.Trainer(net.collect_params(), 'adam',
                            {'learning_rate': opt.lr, 'wd': opt.wd})
    criterion = gluon.loss.SoftmaxCrossEntropyLoss()

    lr = opt.lr
    if opt.warmup:
        minlr = lr * 0.01
        dlr = (lr - minlr) / (epochs[0] - 1)

    prev_time = datetime.datetime.now()
    for epoch in range(epochs[-1]):
        _loss = 0.
        if opt.warmup:
            if epoch < epochs[0]:
                lr = minlr + dlr * epoch
        if epoch in epochs[1:]:
            lr = lr * opt.lr_decay
        trainer.set_learning_rate(lr)

        for data, label in train_data:
            data_list = gluon.utils.split_and_load(data, ctx)
            label_list = gluon.utils.split_and_load(label, ctx)
            with autograd.record():
                output = [net(X) for X in data_list]
                losses = [criterion(X, y) for X, y in zip(output, label_list)]
            for l in losses:
                l.backward()
            trainer.step(batch_size)
            _loss_list = [l.mean().asscalar() for l in losses]
            _loss += sum(_loss_list) / len(_loss_list)

        cur_time = datetime.datetime.now()
        h, remainder = divmod((cur_time - prev_time).seconds, 3600)
        m, s = divmod(remainder, 60)
        time_str = "Time %02d:%02d:%02d" % (h, m, s)

        __loss = _loss / len(train_data)

        if val_data is not None:
            val_loss, val_accuracy = validate(val_data, net, criterion, ctx)
            epoch_str = ("Epoch %d. Train loss: %f, Val loss %f, Val accuracy %f, "
                         % (epoch, __loss, val_loss, val_accuracy))
        else:
            epoch_str = ("Epoch %d. Train loss: %f, " % (epoch, __loss))

        prev_time = cur_time
        print(epoch_str + time_str + ', lr ' + str(trainer.learning_rate))

    if not os.path.exists("params"):
        os.mkdir("params")
    net.save_parameters("params/resnet50.params")


if __name__ == '__main__':
    opt = parser.parse_args()
    logging.info(opt)
    mx.random.seed(opt.seed)

    batch_size = opt.batch_size
    num_gpus = opt.num_gpus
    epochs = [int(i) for i in opt.epochs.split(',')]
    batch_size *= max(1, num_gpus)

    context = [mx.gpu(i) for i in range(num_gpus)]
    net = resnet50(ctx=context, num_classes=751)
    main(net, batch_size, epochs, opt, context)
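The --epochs argument "5,25,50,75" drives both the warmup and the step decay in main(): the first value is the length of the linear warmup from lr*0.01 up to lr, the later values are the epochs at which the rate is multiplied by --lr-decay, and the last value is the total number of training epochs. The standalone sketch below simply re-traces that schedule with the default arguments; it is not part of the original scripts, and its first two printed values match the lr shown in the training log further down.

# Standalone sketch of the learning-rate schedule used in train.py
# (linear warmup over the first epochs[0] epochs, then x0.1 steps at epochs[1:]).
# Values assume the script defaults: --lr 3.5e-4, --lr-decay 0.1, --epochs 5,25,50,75.
base_lr, lr_decay = 3.5e-4, 0.1
epochs = [5, 25, 50, 75]

minlr = base_lr * 0.01
dlr = (base_lr - minlr) / (epochs[0] - 1)

lr = base_lr
for epoch in range(epochs[-1]):
    if epoch < epochs[0]:          # linear warmup
        lr = minlr + dlr * epoch
    if epoch in epochs[1:]:        # step decay
        lr = lr * lr_decay
    if epoch in (0, 1, 4, 25, 50, 74):
        print(epoch, lr)           # epoch 0 -> 3.5e-06, epoch 1 -> 9.0125e-05, ...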
In [7]:
!pwd
/public/home/ztu/code/git/pycode/gluoncv/re-id
In [8]:
!nvidia-smi -L
GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-1251aff4-dcda-c142-af7f-c19a67ed88df)
GPU 1: Tesla P100-PCIE-16GB (UUID: GPU-ae5cde47-bf7f-a6c6-8a68-8a3c96b2dadf)
In [9]:
!nvidia-smi
Tue Oct 22 16:15:17 2019
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 396.37                 Driver Version: 396.37                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Tesla P100-PCIE...  Off  | 00000000:2F:00.0 Off |                    0 |
| N/A   48C    P0    32W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla P100-PCIE...  Off  | 00000000:86:00.0 Off |                    0 |
| N/A   43C    P0    33W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+
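Before pinning the run to one card with CUDA_VISIBLE_DEVICES, a quick check that the MXNet build inside this environment actually sees the GPUs can save some confusion. This is a hypothetical extra cell, not part of the original notebook, and it assumes a CUDA-enabled mxnet package is installed:

# Hypothetical sanity-check cell (not in the original notebook):
# confirm the MXNet build is GPU-enabled and both P100 cards are visible.
import mxnet as mx

print(mx.context.num_gpus())            # expected: 2 on this node
x = mx.nd.ones((2, 3), ctx=mx.gpu(0))   # raises MXNetError if the build is CPU-only
print(x.context)                        # gpu(0)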
In [10]:
!CUDA_VISIBLE_DEVICES=1 python baseline/train.py
INFO:root:Namespace(batch_size=32, dataset='market1501', dataset_root='~/.mxnet/datasets', epochs='5,25,50,75', hybridize=True, img_height=384, img_width=128, lr=0.00035, lr_decay=0.1, momentum=0.9, num_gpus=1, num_workers=8, pad=10, ratio=1.0, seed=613, warmup=True, wd=0.0005)
[16:15:34] src/operator/nn/./cudnn/./cudnn_algoreg-inl.h:97: Running performance tests to find the best convolution algorithm, this can take a while... (set the environment variable MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)
Epoch 0. Train loss: 6.597217, Time 00:01:38, lr 3.5e-06
Epoch 1. Train loss: 4.248931, Time 00:01:32, lr 9.012500000000001e-05
^C
(KeyboardInterrupt tracebacks from the eight DataLoader worker processes and the main process, which was stopped inside baseline/train.py line 133, omitted)
In fact, I had already finished training this model earlier...
so the output of the full GPU training run is omitted here.
Let's go straight to the test code.
In [ ]:
#test.py
# -*- coding: utf-8 -*-
from __future__ import print_function, division

import mxnet as mx
import numpy as np
from mxnet import gluon, nd
from mxnet.gluon import nn
from mxnet.gluon.data.vision import transforms

from networks import resnet18, resnet34, resnet50
from gluoncv.data.market1501.data_read import ImageTxtDataset

import time, os, sys
import scipy.io as sio
from os import path as osp


def get_data(batch_size, test_set, query_set):
    normalizer = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    transform_test = transforms.Compose([
        transforms.Resize(size=(128, 384), interpolation=1),
        transforms.ToTensor(),
        normalizer])

    test_imgs = ImageTxtDataset(test_set, transform=transform_test)
    query_imgs = ImageTxtDataset(query_set, transform=transform_test)

    test_data = gluon.data.DataLoader(test_imgs, batch_size, shuffle=False, last_batch='keep', num_workers=8)
    query_data = gluon.data.DataLoader(query_imgs, batch_size, shuffle=False, last_batch='keep', num_workers=8)
    return test_data, query_data


def load_network(network, ctx):
    network.load_parameters('params/resnet50.params', ctx=ctx, allow_missing=True, ignore_extra=True)
    return network


def fliplr(img):
    '''flip horizontal'''
    img_flip = nd.flip(img, axis=3)
    return img_flip


def extract_feature(model, dataloaders, ctx):
    count = 0
    features = []
    for img, _ in dataloaders:
        n = img.shape[0]
        count += n
        print(count)
        ff = np.zeros((n, 2048))
        for i in range(2):
            if i == 1:
                img = fliplr(img)
            f = model(img.as_in_context(ctx)).as_in_context(mx.cpu()).asnumpy()
            ff = ff + f
        features.append(ff)
    features = np.concatenate(features)
    return features / np.linalg.norm(features, axis=1, keepdims=True)


def get_id(img_path):
    cameras = []
    labels = []
    for path in img_path:
        cameras.append(int(path[0].split('/')[-1].split('_')[1][1]))
        labels.append(path[1])
    return np.array(cameras), np.array(labels)


def compute_mAP(index, good_index, junk_index):
    ap = 0
    cmc = np.zeros(len(index))
    if good_index.size == 0:   # if empty
        cmc[0] = -1
        return ap, cmc

    # remove junk_index
    mask = np.in1d(index, junk_index, invert=True)
    index = index[mask]

    # find good_index index
    ngood = len(good_index)
    mask = np.in1d(index, good_index)
    rows_good = np.argwhere(mask == True)
    rows_good = rows_good.flatten()

    cmc[rows_good[0]:] = 1
    for i in range(ngood):
        d_recall = 1.0 / ngood
        precision = (i + 1) * 1.0 / (rows_good[i] + 1)
        if rows_good[i] != 0:
            old_precision = i * 1.0 / rows_good[i]
        else:
            old_precision = 1.0
        ap = ap + d_recall * (old_precision + precision) / 2

    return ap, cmc


if __name__ == '__main__':
    batch_size = 256
    data_dir = osp.expanduser("~/.mxnet/datasets/Market-1501-v15.09.15/")

    gpu_ids = [0]   # set gpu ids
    if len(gpu_ids) > 0:
        context = mx.gpu()

    test_set = [(osp.join(data_dir, 'bounding_box_test', line), int(line.split('_')[0]))
                for line in os.listdir(data_dir + 'bounding_box_test')
                if "jpg" in line and "-1" not in line]
    query_set = [(osp.join(data_dir, 'query', line), int(line.split('_')[0]))
                 for line in os.listdir(data_dir + 'query') if "jpg" in line]

    test_cam, test_label = get_id(test_set)
    query_cam, query_label = get_id(query_set)

    ######################################################################
    # Load Collected data Trained model
    model_structure = resnet50(ctx=context, pretrained=False)
    model = load_network(model_structure, context)

    # Extract feature
    test_loader, query_loader = get_data(batch_size, test_set, query_set)
    print('start test')
    test_feature = extract_feature(model, test_loader, context)
    print('start query')
    query_feature = extract_feature(model, query_loader, context)

    query_feature = nd.array(query_feature).as_in_context(mx.gpu(0))
    test_feature = nd.array(test_feature).as_in_context(mx.gpu(0))

    num = query_label.size
    dist_all = nd.linalg.gemm2(query_feature, test_feature, transpose_b=True)

    CMC = np.zeros(test_label.size)
    ap = 0.0
    for i in range(num):
        cam = query_cam[i]
        label = query_label[i]
        index = dist_all[i].argsort(is_ascend=False).as_in_context(mx.cpu()).asnumpy().astype("int32")

        query_index = np.argwhere(test_label == label)
        camera_index = np.argwhere(test_cam == cam)

        good_index = np.setdiff1d(query_index, camera_index, assume_unique=True)
        junk_index = np.intersect1d(query_index, camera_index)

        ap_tmp, CMC_tmp = compute_mAP(index, good_index, junk_index)
        CMC = CMC + CMC_tmp
        ap += ap_tmp

    CMC = CMC / num   # average CMC
    print('top1:%f top5:%f top10:%f mAP:%f' % (CMC[0], CMC[4], CMC[9], ap / num))
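A note on the ranking step in test.py: extract_feature() sums the descriptor of each image with that of its horizontal flip and then L2-normalises every row, so nd.linalg.gemm2(query_feature, test_feature, transpose_b=True) yields a query-by-gallery matrix of cosine similarities, and argsort(is_ascend=False) orders the gallery from most to least similar for each query. The toy NumPy sketch below (random arrays, not part of the original scripts) illustrates the same equivalence:

# Why gemm2 on L2-normalised features gives a similarity ranking:
# for unit-length vectors, the dot product equals the cosine similarity.
# Toy arrays only; shapes mimic test.py (queries x 2048, gallery x 2048).
import numpy as np

rng = np.random.RandomState(0)
query = rng.randn(3, 2048)
gallery = rng.randn(5, 2048)

query /= np.linalg.norm(query, axis=1, keepdims=True)      # same normalisation as extract_feature()
gallery /= np.linalg.norm(gallery, axis=1, keepdims=True)

sim = query.dot(gallery.T)                                  # NumPy analogue of nd.linalg.gemm2(..., transpose_b=True)
ranking = np.argsort(-sim, axis=1)                          # descending, like argsort(is_ascend=False)
print(ranking[0])   # gallery indices for the first query, best match first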
In [15]:
!CUDA_VISIBLE_DEVICES=1 python baseline/test.py
start test
256
[16:25:25] src/operator/nn/./cudnn/./cudnn_algoreg-inl.h:97: Running performance tests to find the best convolution algorithm, this can take a while... (set the environment variable MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)
512 768 1024 1280 1536 1792 2048 2304 2560 2816 3072 3328 3584 3840 4096 4352 4608 4864
5120 5376 5632 5888 6144 6400 6656 6912 7168 7424 7680 7936 8192 8448 8704 8960 9216 9472
9728 9984 10240 10496 10752 11008 11264 11520 11776 12032 12288 12544 12800 13056 13312 13568
13824 14080 14336 14592 14848 15104 15360 15616 15872 15913
start query
256 512 768 1024 1280 1536 1792 2048 2304 2560 2816 3072 3328 3368
[16:27:09] src/operator/nn/./cudnn/./cudnn_algoreg-inl.h:97: Running performance tests to find the best convolution algorithm, this can take a while... (set the environment variable MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)
top1:0.921021 top5:0.971793 top10:0.980701 mAP:0.794266
In [ ]:
tz@croplab,HZAU