-
Notifications
You must be signed in to change notification settings - Fork 0
/
Embed_Dive.py
194 lines (158 loc) · 7.35 KB
/
Embed_Dive.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
import LargeVis
import argparse
import fileinput #for reading large files
import json
import random
import numpy as np
import os
import shutil
import csv
import math
from datetime import datetime
import argparse as argp
# Command-line interface. The first group of options is forwarded to LargeVis
# (the embedding step); the second group controls the DiVE export.
parser = argparse.ArgumentParser(
    description='This script performs embedding of a sparse similarity matrix '
                'into coordinates and opens the DiVE viewer with the result.')

# --- LargeVis options -------------------------------------------------------
# -fea == 1 selects LargeVis.loadfile for the input, any other value selects
# LargeVis.loadgraph.
parser.add_argument('-fea', default=0, type=int, help='whether to visualize high-dimensional feature vectors or networks')
parser.add_argument('-input', default='', help='input file')
parser.add_argument('-output', default='', help='output coordinates file')
# The tuning options below all default to -1 and are handed to LargeVis.run
# unchanged.
# NOTE(review): presumably -1 tells LargeVis to use its own default - confirm
# against the LargeVis Python wrapper.
parser.add_argument('-outdim', default=-1, type=int, help='output dimensionality')
parser.add_argument('-threads', default=-1, type=int, help='number of training threads')
parser.add_argument('-samples', default=-1, type=int, help='number of training mini-batches')
parser.add_argument('-prop', default=-1, type=int, help='number of propagations')
parser.add_argument('-alpha', default=-1, type=float, help='learning rate')
parser.add_argument('-trees', default=-1, type=int, help='number of rp-trees')
parser.add_argument('-neg', default=-1, type=int, help='number of negative samples')
parser.add_argument('-neigh', default=-1, type=int, help='number of neighbors in the NN-graph')
parser.add_argument('-gamma', default=-1, type=float, help='weight assigned to negative edges')
parser.add_argument('-perp', default=-1, type=float, help='perplexity for the NN-graph')

# --- DiVE export options ----------------------------------------------------
# The literal string 'No' is the "not supplied" sentinel for the two optional
# input files.
parser.add_argument(
    '-metadata',
    default='No',
    dest='metaDataFile',
    help='Input file containing the properties(text) accompanying the data. Format: [id] [metadata] . Metadata format: "first_line" "second_line" "third_line" ..."n_line"')
parser.add_argument(
    '-dir',
    default=os.getcwd(),
    dest='baseDir',
    help='Base directory to store output files')
parser.add_argument(
    '-np',
    default='No',
    dest='namesOfPropertiesFile',
    help='A json file containing list of properties names. Ex ["Name", "DBSCAN label", "K-means label"]')
parser.add_argument(
    '-json',
    default='data.json',
    dest='jsonFileName',
    help='Name of the output json file, which is input to DiVE')
parser.add_argument(
    '-divedir',
    default=os.getcwd(),
    dest='diveDir',
    help='Directory where DiVE resides')
# Parse the command line once, at module level: `args` is also read as a
# global further down in this file (for the DiVE directory).
args = parser.parse_args()

# Pick the LargeVis loader from -fea: 1 -> loadfile, anything else -> loadgraph.
_load_input = LargeVis.loadfile if args.fea == 1 else LargeVis.loadgraph
_load_input(args.input)

# Run the embedding with the command-line tuning options, then let LargeVis
# write the result to the -output file. `Y` is not used again in this file.
Y = LargeVis.run(
    args.outdim, args.threads, args.samples, args.prop, args.alpha,
    args.trees, args.neg, args.neigh, args.gamma, args.perp
)
LargeVis.save(args.output)
########################## LargeVis has finished, now we generate data for DiVE
def ReadMetaDataFile(metaDataFile):
    """Read the metadata file into a dictionary keyed by point id.

    File format: [id] [metadata]
    metadata format: "first_line" "second_line" "third_line" ... "n_line"

    Every non-blank line is parsed as one space-delimited record with
    double-quote quoting, so a quoted property may itself contain spaces.

    Args:
        metaDataFile: path of the metadata file.

    Returns:
        A dict mapping each id (the first field of a line) to the list of
        the remaining fields of that line. When an id occurs on several
        lines, the last one wins.
    """
    metaDataDict = {}
    with open(metaDataFile) as handle:
        for line in handle:
            # Skip blank and whitespace-only lines: a whitespace-only line
            # would otherwise be stored as an entry with an empty-string id.
            if not line.strip():
                continue
            # A single non-blank input line always yields exactly one record.
            fields = next(csv.reader([line], delimiter=' ', quotechar='"'))
            metaDataDict[fields[0]] = fields[1:]
    return metaDataDict
def ReadCoordinates(file):
    """Read point coordinates from a whitespace-separated file and rescale them.

    Each line with at least three fields is read as ``id c1 c2 [c3 ...]``;
    lines with fewer fields (blank lines and the first line of the file,
    presumably the LargeVis header - confirm against the LargeVis output
    format) are skipped. Fields after the third coordinate are ignored.

    A line with exactly two coordinates is treated as a 2-D point and gets an
    artificial, non-zero first coordinate of 0.1, so every point ends up with
    three coordinates.

    All coordinates are then divided by the largest absolute value found
    among the coordinates read from the file. The artificial 0.1 does not
    take part in that maximum, so it can end up larger than 1 when every
    real coordinate is smaller than 0.1 in magnitude.

    Args:
        file: path of the coordinates file.

    Returns:
        A dict mapping each id (str) to a list of three floats. If every
        coordinate in the file is zero there is nothing to scale by, and the
        coordinates are returned unscaled instead of dividing by zero.

    Raises:
        ValueError: if a coordinate field is not a valid float.
    """
    fixed = {}
    maxabs = 0
    with open(file) as handle:
        for line in handle:
            items = line.split()
            if len(items) < 3:
                # Blank line or the first line of the file: no point here.
                continue
            if len(items) == 3:
                # dimension == 2: add artificial x-dimension, must be non-zero
                coords = [0.1, float(items[1]), float(items[2])]
                maxabs = max(abs(coords[1]), abs(coords[2]), maxabs)
            else:
                coords = [float(items[1]), float(items[2]), float(items[3])]
                maxabs = max(abs(coords[0]), abs(coords[1]), abs(coords[2]), maxabs)
            fixed[items[0]] = coords

    if maxabs == 0:
        # No points at all, or all-zero coordinates: scaling is undefined.
        return fixed
    return {key: [value / maxabs for value in coords]
            for key, coords in fixed.items()}
def CreateSmallDataJSONFile(allPoints, startingFolder, jsonfilename):
    """Serialize the points to JSON and write the files DiVE reads.

    Two files are written:
      * ``<startingFolder>/<jsonfilename>`` with the plain JSON text;
      * ``<args.diveDir>/DiVE/data/data.js`` with the same JSON text prefixed
        by ``const data_all = ``.

    The chosen folder and file name are printed to stdout.

    Args:
        allPoints: JSON-serializable object holding all points.
        startingFolder: output folder; the string "None" selects the current
            working directory.
        jsonfilename: output file name; the string "None" selects
            "data.json".

    NOTE: the location of data.js comes from the module-level ``args``
    (``args.diveDir``), not from a parameter. Neither target directory is
    created here; both must already exist.
    """
    serialized = json.dumps(allPoints)
    if jsonfilename == "None":
        jsonfilename = "data.json"
    if startingFolder == "None":
        startingFolder = os.getcwd()
    print(startingFolder)
    print(jsonfilename)

    # `with` closes each file even when the write fails.
    with open(os.path.join(startingFolder, jsonfilename), "w") as json_file:
        json_file.write(serialized)

    data_js_path = os.path.join(args.diveDir, "DiVE", "data", "data.js")
    with open(data_js_path, "w") as js_file:
        js_file.write("const data_all = " + serialized)
def CreatePointsDictionary(fixedCoordinates, metaDataDict, namesOfPropertiesFile):
    """Combine coordinates and metadata into one dictionary of points.

    Args:
        fixedCoordinates: dict mapping point id to its coordinates.
        metaDataDict: dict mapping point id to its list of properties, or the
            string "no" when there is no metadata.
        namesOfPropertiesFile: path of a JSON file whose content is stored
            under the "NamesOfProperties" key, or the string "No" to omit
            that key.

    Returns:
        A dict with, for every point id, an entry
        ``{"Coordinates": ..., "Properties": ...}``. "Properties" is an empty
        list for points without metadata. When a names file is given, the
        "NamesOfProperties" entry is inserted before the points.
    """
    result = {}
    if namesOfPropertiesFile != "No":
        with open(namesOfPropertiesFile) as names_file:
            result["NamesOfProperties"] = json.load(names_file)

    has_metadata = metaDataDict != "no"
    for point_id, coordinates in fixedCoordinates.items():
        if has_metadata and point_id in metaDataDict:
            properties = metaDataDict[point_id]
        else:
            properties = []
        result[point_id] = {"Coordinates": coordinates, "Properties": properties}
    return result
def CreateDirIfDoesNotExist(dirname):
    """Create `dirname` (with any missing parents) unless the path already exists."""
    if os.path.exists(dirname):
        return
    os.makedirs(dirname)
def RemoveDirTreeIfExists(dirname):
    """Delete the directory tree at `dirname`; do nothing if the path does not exist."""
    if not os.path.exists(dirname):
        return
    shutil.rmtree(dirname)
def ConvertCoordinatesToList(fixedCoordinate):
    """Replace every value of `fixedCoordinate` with a new list, in place.

    The dictionary itself is modified; nothing is returned.
    """
    # Only existing keys are reassigned, so the dict keeps its size while
    # it is being iterated.
    for point_id, coordinates in fixedCoordinate.items():
        fixedCoordinate[point_id] = list(coordinates)
def Workflow(coordinatesFile, metaDataFile, namesOfPropertiesFile, baseDir = os.getcwd(), jsonfilename = "data.json"):
    """Produce the input for DiVE.

    Reads the coordinates (and, optionally, the metadata), combines them into
    one dictionary of points and writes that dictionary out. Progress
    messages with a timestamp are printed along the way.

    coordinatesFile - the output of LargeVis
    metaDataFile - file with the metadata of the points, or "No" for none
    namesOfPropertiesFile - file with a list of names of properties,
        Ex ["Name", "DBSCAN label", "K-means label"]
    baseDir - where to write output; created if it does not exist.
        NOTE: the default is evaluated once, when the function is defined.
    jsonfilename - the name of the output file
    """
    print(str(datetime.now()) + ": Reading input files...")
    # "no" is the sentinel CreatePointsDictionary checks for "no metadata".
    metaDataDict = "no" if metaDataFile == "No" else ReadMetaDataFile(metaDataFile)
    points = ReadCoordinates(coordinatesFile)
    ConvertCoordinatesToList(points)
    pointsDict = CreatePointsDictionary(points, metaDataDict, namesOfPropertiesFile)

    print(str(datetime.now()) + ": Start writing output...")
    CreateDirIfDoesNotExist(baseDir)
    CreateSmallDataJSONFile(pointsDict, baseDir, jsonfilename)
    print(str(datetime.now()) + ": Finished writing output.")
if __name__ == "__main__":
    # Build the DiVE input from the LargeVis output written earlier.
    Workflow(
        args.output,
        args.metaDataFile,
        args.namesOfPropertiesFile,
        args.baseDir,
        args.jsonFileName,
    )

    # Now open DiVE's index page in the browser.
    import webbrowser
    dive_index_url = args.diveDir + "/DiVE/index.html"
    webbrowser.open(dive_index_url, new=2)  # new=2: open in a new tab, if possible