-
Notifications
You must be signed in to change notification settings - Fork 0
/
load.py
78 lines (65 loc) · 3.01 KB
/
load.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" Module with an only class that simply loads the models
Date: 23.08.2018
Author: cristinae
"""
import sys
import resource
import os.path
import easyBPE
from gensim.models import KeyedVectors
class QueryTrad:
def __init__(self, modelPath):
# File location
# modelPath = "../models/"
self.modelPath =modelPath
self.ctFile = modelPath + "CT/meshSplit2.solr.all-languages.txt"
self.swFile = modelPath + "DeEnEsFr.plain.sw"
self.BPEcodes = modelPath + "L1L2.final.bpe"
self.embeddingRoot = modelPath + "embeddingsL1solr."
# Load stopword plain file
swList = {}
for line in open(self.swFile):
line = line.strip()
swList[line] = 1
mem=resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
print("..List of stop words loaded. Using "+str(mem)+" MB of memory")
self.swList = swList
# Load the lexicon
ctDict = {}
targets = []
# data ordered as es->fr->de->en so that in case of ambiguity 'en' remains
for line in open(self.ctFile):
line = line.strip()
if (line.startswith('<<<')): #the new version of the dictionary has a comment line
continue
source, targets = line.split("|||",1)
ctDict[source] = targets
mem=resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
print("..Multilingual lexicon loaded. Using "+str(mem)+" MB of memory")
self.ctDict = ctDict
# Load BPE model
self.bpe = easyBPE.BPE(self.BPEcodes)
mem=resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
print("..BPE model loaded. Using "+str(mem)+" MB of memory")
# Load embeddings
self.embeddingL1 = KeyedVectors.load_word2vec_format(self.embeddingRoot+'w2v', binary=False)
mem=resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
print("\n..Multilingual embeddings loaded. Using "+str(mem)+" MB of memory")
self.embeddingEn = KeyedVectors.load_word2vec_format(self.embeddingRoot+'en.w2v')
mem=resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
print("..English embeddings loaded. Using "+str(mem)+" MB of memory")
self.embeddingEs = KeyedVectors.load_word2vec_format(self.embeddingRoot+'es.w2v')
mem=resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
print("..Spanish embeddings loaded. Using "+str(mem)+" MB of memory")
self.embeddingDe = KeyedVectors.load_word2vec_format(self.embeddingRoot+'de.w2v')
mem=resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
print("..German embeddings loaded. Using "+str(mem)+" MB of memory")
self.embeddingFr = KeyedVectors.load_word2vec_format(self.embeddingRoot+'fr.w2v')
mem=resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
print("..French embeddings loaded. Using "+str(mem)+" MB of memory\n")
def getCtDict(self):
return self.ctDict
def getSWlist(self):
return self.swList