# creates the nice .html page
# assumes that pdftowordcloud.py, pdftothumbs.py and scrape.py were already run
import pickle
from numpy import argmax, zeros, ones
from math import log
# load the pickle of papers scraped from the HTML page (result of scrape.py)
paperdict = pickle.load(open("papers.p", "rb"))
print("Loaded %d papers from papers.p (generated by scrape.py)" % (len(paperdict), ))
# load the top word frequencies (result of pdftowordcloud.py)
topdict = pickle.load(open("topwords.p", "rb"))
print("Loaded %d entries from topwords.p (generated by pdftowordcloud.py)" % (len(topdict), ))
# load the LDA topic-word matrix and invert its vocabulary list
(ldak, phi, voca) = pickle.load(open("ldaphi.p", "rb"))
wtoid = {}
for i, w in enumerate(voca):
    wtoid[w] = i
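# wtoid inverts voca: if voca = ['deep', 'image', ...] (hypothetical entries),
# then wtoid['image'] == 1, so phi[:, wtoid[w]] below looks up the topic
# column for word w.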
# compute pairwise distances between papers based on top words
# using something similar to tfidf, but simpler. No vectors
# will be normalized or otherwise harmed during this computation.
# first compute inverse document frequency (idf)
N = len(paperdict) # number of documents
idf = {}
for pid, p in enumerate(paperdict):
    tw = topdict.get(p, [])  # top 100 words
    ts = [x[0] for x in tw]
    for t in ts:
        idf[t] = idf.get(t, 0.0) + 1.0
for t in idf:
    idf[t] = log(N / idf[t], 2)
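# e.g. with N = 8 papers, a word that appears in the top-100 list of 2 of
# them gets idf = log2(8/2) = 2.0, while a word shared by all 8 gets
# idf = 0 and thus contributes nothing to the pairwise scores below.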
# now compute weighted intersection
ds = zeros((N, N))
for pid, p in enumerate(paperdict):
    tw = topdict.get(p, [])
    w = set([x[0] for x in tw])  # just the words
    for pid2, p2 in enumerate(paperdict):
        if pid2 < pid:
            continue
        tw2 = topdict.get(p2, [])
        w2 = set([x[0] for x in tw2])  # just the words
        # tw and tw2 are top 100 words as (word, count) in both papers.
        # Compute the intersection!
        winter = w.intersection(w2)
        score = sum([idf[x] for x in winter])
        ds[pid, pid2] = score
        ds[pid2, pid] = score
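# note: ds is symmetric, and ds[pid, pid] is each paper's self-similarity
# (the idf-weighted mass of its entire top-word set); no other entry in that
# row can exceed it, since every intersection is a subset of that set.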
# build up the string for html
html = open("cvprnice_template.html", "r").read()
s = ""
js = "ldadist=["
js2 = "pairdists=["
def link_url(abstract):
    # turn the url in an abstract (typically a link to the paper's code)
    # into a working hyperlink. This may require human intervention if the
    # rules below are not satisfied, but for nearly all papers this works.
    # assumes at most one url per abstract.
    # locate the starting position of the url in the abstract
    index = abstract.find('http')
    # only proceed if 'http' actually occurs in the abstract
    if index != -1:
        # initialize the url to be everything after 'http'
        url = abstract[index:]
        # find the first space that occurs after 'http'
        first_space = url.find(' ')
        # if the url is in the middle of the paragraph,
        # stop at the first space that occurs.
        if first_space != -1:
            url = url[:first_space]
        # strip a trailing '.)' or ').' from the url; this commonly
        # happens when the url ends a sentence
        for _ in range(2):
            if url[-1] == '.' or url[-1] == ')':
                url = url[:-1]
        # replace the plain url with a linked url
        abstract = abstract.replace(url, '<a href="%s" target="_blank">%s</a>' % (url, url))
    return abstract
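# example with hypothetical abstract text:
#   link_url('Code: https://github.com/user/repo.')
# returns
#   'Code: <a href="https://github.com/user/repo" target="_blank">https://github.com/user/repo</a>.'
# (the sentence-ending period is stripped from the url and left in the text)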
def add_supp(supp):
    """Adds a link to the supplemental material, if it exists."""
    if not supp:
        return ''
    return '<a href="%s" target="_blank">[supp]</a>' % (supp,)
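# example: add_supp('') -> '', while a path such as 'supp/1234.pdf'
# (hypothetical) -> '<a href="supp/1234.pdf" target="_blank">[supp]</a>'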
for pid, p in enumerate(paperdict):
    # pid goes 0..N-1, p are the paper ids (the dict keys)
    print("PROCESSING:", pid, '/', len(paperdict))
    # get title, author, bib
    title, author, bib, supp = paperdict[p]
    # create the tags string
    topwords = topdict.get(p, [])
    # some top-100 words may not have been seen during LDA, so exclude them
    # if they aren't found in wtoid
    t = [x[0] for x in topwords if x[0] in wtoid]
    tid = [int(argmax(phi[:, wtoid[x]])) for x in t]  # assign each word to its most probable topic
    tcat = ""
    for k in range(ldak):
        ws = [x for i, x in enumerate(t) if tid[i] == k]
        tcat += '[<span class="t' + str(k) + '">' + ", ".join(ws) + '</span>] '
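    # tcat now reads like '[<span class="t0">word, word</span>] [<span class="t1">...',
    # one bracketed, css-classed group of words per topic.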
    # count up the complete distribution for the entire document and build up
    # a javascript vector storing all this
    svec = zeros(ldak)
    for w in t:
        svec += phi[:, wtoid[w]]
    if svec.sum() == 0:
        svec = ones(ldak) / ldak  # no known words: fall back to a uniform distribution
    else:
        svec = svec / svec.sum()  # normalize
    nums = ["%.2f" % (float(svec[k]),) for k in range(ldak)]
    js += "[" + ",".join(nums) + "]"
    if pid != len(paperdict) - 1:
        js += ","
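    # e.g. with ldak = 3 this appends something like "[0.70,0.20,0.10]": one
    # row of the ldadist javascript array spliced into the template below.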
    # dump similarities of this document to others
    scores = ["%.2f" % (float(ds[pid, i]),) for i in range(N)]
    js2 += "[" + ",".join(scores) + "]"
    if pid != len(paperdict) - 1:
        js2 += ","
    # get path to the thumbnail for this paper
    thumbpath = "thumbs/%s.jpg" % (p,)
    # get link to the PDF on the CVPR open-access servers
    pdflink = "http://openaccess.thecvf.com/content_CVPR_2020/papers/%s_CVPR_2020_paper.pdf" % (p,)
s += """
<div class="apaper" id="pid%d">
<div class="paperdesc">
<span class="ts">%s</span><br />
<span class="as">%s</span><br /><br />
</div>
<div class="dllinks">
<a href="%s" target="_blank">[pdf]</a>
%s
<span class="copybib" id="bibclick%d">[copy bibtex]</span>
<span class="ldaclick" id="ldaclick%d">[LDA topics]</span>
<span class="sim" id="sim%d">[show similar]</span>
<span class="absclick" id="abclick%d">[abstract]</span>
</div>
<a href="%s" class="image_link" target="_blank"><img src = "%s"></a><br />
<div class="pinfo">
<div class="abstr" id="abdis%d">%s</div>
<div class="topics" id="todis%d">%s</div>
<div class="bib" id="bibdis%d">%s</div>
</div>
</div>
""" % (pid, # main div id
title, # title
author, # authors
pdflink, # pdf link
add_supp(supp), # supplemental material
pid, # bib num
pid, # lda num
pid, # sim num
pid, # abs num
pdflink, # link the image to the pdf
thumbpath, # image
pid, # abs display num
link_url(open('abstracts/%s_CVPR_2020_paper.txt' % (p,), 'r').read()), # abstract
pid, # topics display num
tcat, # topics
pid, # bib display num
bib # bib text
)
newhtml = html.replace("RESULTTABLE", s)
js += "]"
newhtml = newhtml.replace("LOADDISTS", js)
js2 += "]"
newhtml = newhtml.replace("PAIRDISTS", js2)
f = open("cvprnice.html", "w")
f.write(newhtml)
f.close()
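# the finished page is written next to this script; open cvprnice.html in a
# browser to view it.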