generatenicelda.py

# creates the nice .html page
# assumes that pdftowordcloud.py, pdftothumbs.py and scrape.py were already run

import pickle
from numpy import argmax, zeros, ones
from math import log

# load the pickle of papers scraped from the HTML page (result of scrape.py)
paperdict = pickle.load(open( "papers.p", "rb" ))
print("Loaded %d papers from papers.p (generated by scrape.py)" % (len(paperdict), ))

# load the top word frequencies (result of pdftowordcloud.py)
topdict = pickle.load(open("topwords.p", "rb"))
print("Loaded %d entries from topwords.p (generated by pdftowordcloud.py)" % (len(topdict), ))

# load LDA words and invert their dictionary list
(ldak, phi, voca) = pickle.load(open("ldaphi.p", "rb"))
wtoid = {}
for i,w in enumerate(voca):
	wtoid[w] = i

# compute pairwise distances between papers based on top words
# using something similar to tfidf, but simpler. No vectors
# will be normalized or otherwise harmed during this computation.
# first compute inverse document frequency (idf)
N = len(paperdict) # number of documents
idf = {}
for pid,p in enumerate(paperdict):
	tw = topdict.get(p, []) # top 100 words
	ts = [x[0] for x in tw]
	for t in ts:
		idf[t] = idf.get(t, 0.0) + 1.0
for t in idf:
	idf[t] = log(N/idf[t], 2)

# now compute weighted intersection
ds = zeros((N, N))
for pid,p in enumerate(paperdict):
	tw = topdict.get(p, [])
	w = set([x[0] for x in tw]) # just the words
	accum = 0.0

	for pid2, p2 in enumerate(paperdict):
		if pid2<pid: continue
		tw2= topdict.get(p2, [])
		w2 = set([x[0] for x in tw2]) # just the words

		# tw and tw2 are top 100 words as (word, count) in both papers. Compute
		# the intersection!
		winter = w.intersection(w2)
		score = sum([idf[x] for x in winter])
		ds[pid, pid2] = score
		ds[pid2, pid] = score

# build up the string for html
html = open("cvprnice_template.html", "r").read()
s = ""
js = "ldadist=["
js2 = "pairdists=["


def link_url(abstract):
	# add working links to the code for each abstract.
	# This may require human interaction, if the rules
	# are not satisfied, but for nearly all papers this works.
	# assumes max of one link in the abstract

	# locates the starting location of the url in the abstract
	index = abstract.find('http')

	# checks to make sure 'http' is actually in the abstract
	if index != -1:
		# initialize the url to be everything after 'http'
		url = abstract[index:]

		# find the first space that occurs after 'http' is found
		first_space = url.find(' ')

		# if the url is in the middle of the paragraph,
		# stop at the first space that occurs.
		if first_space != -1:
			url = url[:first_space]

		# removes a .) or ). if its the last part of the url
		# this commonly occurs if the url ends a sentence
		for _ in range(2):
			if url[-1] == '.' or url[-1] == ')':
				url = url[:-1]

		# replace the url with a linked url
		abstract = abstract.replace(url, '<a href="%s" target="_blank">%s</a>' % (url, url))

	return abstract


def add_supp(supp):
	""" Adds supplemental material, if supplemental material exists. """
	if not supp:
		return ''
	return '<a href="%s" target="_blank">[supp]</a>' % (supp,)


for pid, p in enumerate(paperdict):
	# pid goes 1...N, p are the keys

	print("PROCESSING:", pid, '/', len(paperdict))

	# get title, author, bib
	title, author, bib, supp = paperdict[p]

	# create the tags string
	topwords = topdict.get(p, [])
	# some top100 words may not have been computed during LDA so exclude them if
	# they aren't found in wtoid
	t = [x[0] for x in topwords if x[0] in wtoid] 
	tid = [int(argmax(phi[:, wtoid[x]])) for x in t] # assign each word to class
	tcat = ""
	for k in range(ldak):
		ws = [x for i,x in enumerate(t) if tid[i]==k]
		tcat += '[<span class="t'+ str(k) + '">' + ", ".join(ws) + '</span>] '
	
	# count up the complete distribution for the entire document and build up
	# a javascript vector storing all this
	svec = zeros(ldak)
	for w in t: 
		svec += phi[:, wtoid[w]]
	if svec.sum() == 0: 
		svec = ones(ldak)/ldak
	else: 
		svec = svec / svec.sum() # normalize
	nums = [0 for k in range(ldak)]
	for k in range(ldak): 
		nums[k] = "%.2f" % (float(svec[k]), )
	
	js += "[" + ",".join(nums) + "]"
	if not pid == len(paperdict)-1: js += ","

	# dump similarities of this document to others
	scores = ["%.2f" % (float(ds[pid, i]),) for i in range(N)]
	js2 += "[" + ",".join(scores) + "]"
	if not pid == len(paperdict)-1: js2 += ","

	# get path to thumbnails for this paper
	thumbpath = "thumbs/%s.jpg" % (p, )

	# get links to PDF on CVPR servers
	pdflink = "http://openaccess.thecvf.com/content_CVPR_2020/papers/%s_CVPR_2020_paper.pdf" % (p, )

	s += """

	<div class="apaper" id="pid%d">
	<div class="paperdesc">
		<span class="ts">%s</span><br />
		<span class="as">%s</span><br /><br />
	</div>
	<div class="dllinks">
		<a href="%s" target="_blank">[pdf]</a>
		%s
		<span class="copybib" id="bibclick%d">[copy bibtex]</span>
		<span class="ldaclick" id="ldaclick%d">[LDA topics]</span>
		<span class="sim" id="sim%d">[show similar]</span>
		<span class="absclick" id="abclick%d">[abstract]</span>
	</div>
	<a href="%s" class="image_link" target="_blank"><img src = "%s"></a><br />
	<div class="pinfo">
		<div class="abstr" id="abdis%d">%s</div>
		<div class="topics" id="todis%d">%s</div>
		<div class="bib" id="bibdis%d">%s</div>
	</div>
	</div>

	""" % (pid, # main div id
		   title, # title
		   author, # authors
		   pdflink, # pdf link
		   add_supp(supp), # supplemental material
		   pid, # bib num
		   pid, # lda num
		   pid, # sim num
		   pid, # abs num
		   pdflink, # link the image to the pdf
		   thumbpath, # image
		   pid, # abs display num
		   link_url(open('abstracts/%s_CVPR_2020_paper.txt' % (p,), 'r').read()), # abstract
		   pid, # topics display num
		   tcat, # topics
		   pid, # bib display num
		   bib # bib text
		  )

newhtml = html.replace("RESULTTABLE", s)

js += "]"
newhtml = newhtml.replace("LOADDISTS", js)

js2 += "]"
newhtml = newhtml.replace("PAIRDISTS", js2)

f = open("cvprnice.html", "w")
f.write(newhtml)
f.close()