import time

import numpy as np
import torch
from torch.autograd import Variable
from torch.utils.data import DataLoader
from tqdm import tqdm

from beamsearch import beamsearch
from coco_loader import coco_loader
from convcap import convcap
from vggfeats import Vgg16Feats
from evaluate import language_eval

def repeat_img(args, img_emb):
  """Tile each row of a 2-D image embedding beam_size times along dim 0."""
  b = img_emb.size(0)
  assert img_emb.dim() == 2
  img_emb_new = Variable(img_emb.data.new(b * args.beam_size, img_emb.size(1))).cuda()
  for k in range(b):
    start_idx = k * args.beam_size
    img_emb_new[start_idx:start_idx + args.beam_size, :] = img_emb[k, :].repeat(args.beam_size, 1)
  return img_emb_new
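
# Layout produced by repeat_img (illustrative, assuming beam_size=3):
# input rows [x0, x1] -> output rows [x0, x0, x0, x1, x1, x1],
# i.e. beam m of image k lives at row k*beam_size + m.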

def test_beam(args, split, modelfn=None):
  """Sample generation with beam search, then run language evaluation."""
  t_start = time.time()
  data = coco_loader(args.coco_root, split=split, ncap_per_img=1)
  print('[DEBUG] Loading %s data ... %f secs' % (split, time.time() - t_start))

  data_loader = DataLoader(dataset=data, num_workers=args.nthreads,
                           batch_size=args.batchsize, shuffle=False, drop_last=True)

  batchsize = args.batchsize
  max_tokens = data.max_tokens
  # drop_last=True above, so only full batches are processed.
  num_batches = len(data.ids) // batchsize
  print('[DEBUG] Running test (w/ beam search) on %d batches' % num_batches)

  model_imgcnn = Vgg16Feats()
  model_imgcnn.cuda()

  model_convcap = convcap(data.numwords, args.num_layers, is_attention=args.attention)
  model_convcap.cuda()

  print('[DEBUG] Loading checkpoint %s' % modelfn)
  checkpoint = torch.load(modelfn)
  model_convcap.load_state_dict(checkpoint['state_dict'])
  model_imgcnn.load_state_dict(checkpoint['img_state_dict'])

  # Switch to inference mode (disables dropout / batch-norm updates).
  model_imgcnn.train(False)
  model_convcap.train(False)

  pred_captions = []
  for batch_idx, (imgs, _, _, _, img_ids) in \
      tqdm(enumerate(data_loader), total=num_batches):

    imgs = imgs.view(batchsize, 3, 224, 224)
    imgs_v = Variable(imgs.cuda())
    imgsfeats, imgsfc7 = model_imgcnn(imgs_v)

    # Tile the conv feature maps so every beam of an image sees the same
    # features: (b, f_dim, f_h, f_w) -> (b*beam_size, f_dim, f_h, f_w).
    b, f_dim, f_h, f_w = imgsfeats.size()
    imgsfeats = imgsfeats.unsqueeze(1).expand(
        b, args.beam_size, f_dim, f_h, f_w)
    imgsfeats = imgsfeats.contiguous().view(
        b * args.beam_size, f_dim, f_h, f_w)

    beam_searcher = beamsearch(args.beam_size, batchsize, max_tokens)

    # Every beam starts with the start-of-sentence token.
    wordclass_feed = np.zeros((args.beam_size * batchsize, max_tokens), dtype='int64')
    wordclass_feed[:, 0] = data.wordlist.index('<S>')

    # Tile the fc7 embeddings the same way (see repeat_img above).
    imgsfc7 = repeat_img(args, imgsfc7)

    outcaps = [[] for _ in range(batchsize)]  # one token list per image
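
    # From here on, every tensor fed to the caption model has
    # batchsize*beam_size rows, with beam m of image k at row
    # k*beam_size + m (matching the repeat_img layout).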
    for j in range(max_tokens - 1):
      wordclass = Variable(torch.from_numpy(wordclass_feed)).cuda()

      wordact, _ = model_convcap(imgsfeats, imgsfc7, wordclass)
      # Drop the prediction for the final position and pick the scores
      # for step j.
      wordact = wordact[:, :, :-1]
      wordact_j = wordact[..., j]

      beam_indices, wordclass_indices = beam_searcher.expand_beam(wordact_j)

      if len(beam_indices) == 0 or j == (max_tokens - 2):  # Beam search is over.
        generated_captions = beam_searcher.get_results()
        for k in range(batchsize):
          g = generated_captions[:, k]
          outcaps[k] = [data.wordlist[x] for x in g]
      else:
        # Reorder the beam state to follow the surviving hypotheses,
        # then append the chosen word for step j+1 to each beam.
        wordclass_feed = wordclass_feed[beam_indices]
        imgsfc7 = imgsfc7.index_select(0, Variable(torch.cuda.LongTensor(beam_indices)))
        imgsfeats = imgsfeats.index_select(0, Variable(torch.cuda.LongTensor(beam_indices)))
        for i, wordclass_idx in enumerate(wordclass_indices):
          wordclass_feed[i, j + 1] = wordclass_idx

    # Truncate each caption at its end-of-sentence token and record it.
    for k in range(batchsize):
      num_words = len(outcaps[k])
      if 'EOS' in outcaps[k]:
        num_words = outcaps[k].index('EOS')
      outcap = ' '.join(outcaps[k][:num_words])
      pred_captions.append({'image_id': img_ids[k], 'caption': outcap})

  scores = language_eval(pred_captions, args.model_dir, split)

  # Restore training mode for any subsequent training steps.
  model_imgcnn.train(True)
  model_convcap.train(True)

  return scores
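

# Minimal usage sketch (hypothetical values; only the attributes this file
# actually reads are set, and 'model.pth' stands in for a real checkpoint):
#
#   args = argparse.Namespace(coco_root='data/coco', nthreads=4, batchsize=20,
#                             beam_size=3, num_layers=3, attention=True,
#                             model_dir='output')
#   scores = test_beam(args, split='test', modelfn='model.pth')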