-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdocuments_containing_word.py
209 lines (193 loc) · 18 KB
/
documents_containing_word.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
# coding=utf-8
import os
import re
import unicodedata
from mrjob.job import MRJob
from mrjob.step import MRStep
# MapReduce job that, for each tracked word (see MAIN_WORDS), lists the input documents containing it, ordered by
# how many times the word occurs in each document.
class MRDocumentsContainingWord(MRJob):
# Spanish stop words. The mapper discards a token that matches any entry here; the check is made twice: once on the
# lowercased raw token and once more after accents and non-letter characters have been stripped. The list therefore
# carries both accented and unaccented spellings of many words (e.g. "ademas" / "además").
STOP_WORDS_ES = ["a", "actualmente", "acuerdo", "adelante", "ademas", "además", "adrede", "afirmó", "agregó", "ahi",
"ahora", "ahí", "al", "algo", "alguna", "algunas", "alguno", "algunos", "algún", "alli", "allí",
"alrededor", "ambos", "ampleamos", "antano", "antaño", "ante", "anterior", "antes", "apenas",
"aproximadamente", "aquel", "aquella", "aquellas", "aquello", "aquellos", "aqui", "aquél",
"aquélla", "aquéllas", "aquéllos", "aquí", "arriba", "arribaabajo", "aseguró", "asi", "así",
"atras", "aun", "aunque", "ayer", "añadió", "anadió", "aún", "b", "bajo", "bastante", "bien",
"breve", "buen", "buena", "buenas", "bueno", "buenos", "c", "cada", "casi", "cerca", "cierta",
"ciertas", "cierto", "ciertos", "cinco", "claro", "comentó", "como", "con", "conmigo", "conocer",
"conseguimos", "conseguir", "considera", "consideró", "consigo", "consigue", "consiguen",
"consigues", "contigo", "contra", "cosas", "creo", "cual", "cuales", "cualquier", "cuando",
"cuanta", "cuantas", "cuanto", "cuantos", "cuatro", "cuenta", "cuál", "cuáles", "cuándo", "cuánta",
"cuántas", "cuánto", "cuántos", "cómo", "d", "da", "dado", "dan", "dar", "de", "debajo", "debe",
"deben", "debido", "decir", "dejó", "del", "delante", "demasiado", "demás", "dentro", "deprisa",
"desde", "despacio", "despues", "después", "detras", "detrás", "dia", "dias", "dice", "dicen",
"dicho", "dieron", "diferente", "diferentes", "dijeron", "dijo", "dio", "donde", "dos", "durante",
"día", "días", "dónde", "e", "ejemplo", "el", "ella", "ellas", "ello", "ellos", "embargo",
"empleais", "emplean", "emplear", "empleas", "empleo", "en", "encima", "encuentra", "enfrente",
"enseguida", "entonces", "entre", "era", "eramos", "eran", "eras", "eres", "es", "esa", "esas",
"ese", "eso", "esos", "esta", "estaba", "estaban", "estado", "estados", "estais", "estamos",
"estan", "estar", "estará", "estas", "este", "esto", "estos", "estoy", "estuvo", "está", "están",
"ex", "excepto", "existe", "existen", "explicó", "expresó", "f", "fin", "final", "fue", "fuera",
"fueron", "fui", "fuimos", "g", "general", "gran", "grandes", "gueno", "h", "ha", "haber", "habia",
"habla", "hablan", "habrá", "había", "habían", "hace", "haceis", "hacemos", "hacen", "hacer",
"hacerlo", "haces", "hacia", "haciendo", "hago", "han", "hasta", "hay", "haya", "he", "hecho",
"hemos", "hicieron", "hizo", "horas", "hoy", "hubo", "i", "igual", "incluso", "indicó", "informo",
"informó", "intenta", "intentais", "intentamos", "intentan", "intentar", "intentas", "intento",
"ir", "j", "junto", "k", "l", "la", "lado", "largo", "las", "le", "lejos", "les", "llegó", "lleva",
"llevar", "lo", "los", "luego", "lugar", "m", "mal", "manera", "manifestó", "mas", "mayor", "me",
"mediante", "medio", "mejor", "mencionó", "menos", "menudo", "mi", "mia", "mias", "mientras",
"mio", "mios", "mis", "misma", "mismas", "mismo", "mismos", "modo", "momento", "mucha", "muchas",
"mucho", "muchos", "muy", "más", "mí", "mía", "mías", "mío", "míos", "n", "nada", "nadie", "ni",
"ninguna", "ningunas", "ninguno", "ningunos", "ningún", "no", "nos", "nosotras", "nosotros",
"nuestra", "nuestras", "nuestro", "nuestros", "nueva", "nuevas", "nuevo", "nuevos", "nunca", "o",
"ocho", "os", "otra", "otras", "otro", "otros", "p", "pais", "para", "parece", "parte", "partir",
"pasada", "pasado", "paìs", "peor", "pero", "pesar", "poca", "pocas", "poco", "pocos", "podeis",
"podemos", "poder", "podria", "podriais", "podriamos", "podrian", "podrias", "podrá", "podrán",
"podría", "podrían", "poner", "por", "porque", "posible", "primer", "primera", "primero",
"primeros", "principalmente", "pronto", "propia", "propias", "propio", "propios", "proximo",
"próximo", "próximos", "pudo", "pueda", "puede", "pueden", "puedo", "pues", "q", "qeu", "que",
"quedó", "queremos", "quien", "quienes", "quiere", "quiza", "quizas", "quizá", "quizás", "quién",
"quiénes", "qué", "r", "raras", "realizado", "realizar", "realizó", "repente", "respecto", "s",
"sabe", "sabeis", "sabemos", "saben", "saber", "sabes", "salvo", "se", "sea", "sean", "segun",
"segunda", "segundo", "según", "seis", "ser", "sera", "será", "serán", "sería", "señaló", "senaló",
"si", "sido", "siempre", "siendo", "siete", "sigue", "siguiente", "sin", "sino", "sobre", "sois",
"sola", "solamente", "solas", "solo", "solos", "somos", "son", "soy", "soyos", "su", "supuesto",
"sus", "suya", "suyas", "suyo", "sé", "sí", "sólo", "t", "tal", "tambien", "también", "tampoco",
"tan", "tanto", "tarde", "te", "temprano", "tendrá", "tendrán", "teneis", "tenemos", "tener",
"tenga", "tengo", "tenido", "tenía", "tercera", "ti", "tiempo", "tiene", "tienen", "toda", "todas",
"todavia", "todavía", "todo", "todos", "total", "trabaja", "trabajais", "trabajamos", "trabajan",
"trabajar", "trabajas", "trabajo", "tras", "trata", "través", "tres", "tu", "tus", "tuvo", "tuya",
"tuyas", "tuyo", "tuyos", "tú", "u", "ultimo", "un", "una", "unas", "uno", "unos", "usa", "usais",
"usamos", "usan", "usar", "usas", "uso", "usted", "ustedes", "v", "va", "vais", "valor", "vamos",
"van", "varias", "varios", "vaya", "veces", "ver", "verdad", "verdadera", "verdadero", "vez",
"vosotras", "vosotros", "voy", "vuestra", "vuestras", "vuestro", "vuestros", "w", "x", "y", "ya",
"yo", "z", "él", "ésa", "ésas", "ése", "ésos", "ésta", "éstas", "éste", "éstos", "última",
"últimas", "último", "últimos"]
# English stop words. The mapper discards a token that matches any entry here; the check is made on the lowercased
# raw token and again after normalization. Entries containing an apostrophe (e.g. "don't") can only match the first
# check, because SPECIAL_CHARACTERS_RE removes the apostrophe during normalization.
STOP_WORDS_EN = ["a", "a's", "able", "about", "above", "according", "accordingly", "across", "actually", "after",
"afterwards", "again", "against", "ain't", "all", "allow", "allows", "almost", "alone", "along",
"already", "also", "although", "always", "am", "among", "amongst", "an", "and", "another", "any",
"anybody", "anyhow", "anyone", "anything", "anyway", "anyways", "anywhere", "apart", "appear",
"appreciate", "appropriate", "are", "aren't", "around", "as", "aside", "ask", "asking",
"associated", "at", "available", "away", "awfully", "b", "be", "became", "because", "become",
"becomes", "becoming", "been", "before", "beforehand", "behind", "being", "believe", "below",
"beside", "besides", "best", "better", "between", "beyond", "both", "brief", "but", "by", "c",
"c'mon", "c's", "came", "can", "can't", "cannot", "cant", "cause", "causes", "certain",
"certainly", "changes", "clearly", "co", "com", "come", "comes", "concerning", "consequently",
"consider", "considering", "contain", "containing", "contains", "corresponding", "could",
"couldn't", "course", "currently", "d", "definitely", "described", "despite", "did", "didn't",
"different", "do", "does", "doesn't", "doing", "don't", "done", "down", "downwards", "during", "e",
"each", "edu", "eg", "eight", "either", "else", "elsewhere", "enough", "entirely", "especially",
"et", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex",
"exactly", "example", "except", "f", "far", "few", "fifth", "first", "five", "followed",
"following", "follows", "for", "former", "formerly", "forth", "four", "from", "further",
"furthermore", "g", "get", "gets", "getting", "given", "gives", "go", "goes", "going", "gone",
"got", "gotten", "greetings", "h", "had", "hadn't", "happens", "hardly", "has", "hasn't", "have",
"haven't", "having", "he", "he's", "hello", "help", "hence", "her", "here", "here's", "hereafter",
"hereby", "herein", "hereupon", "hers", "herself", "hi", "him", "himself", "his", "hither",
"hopefully", "how", "howbeit", "however", "i", "i'd", "i'll", "i'm", "i've", "ie", "if", "ignored",
"immediate", "in", "inasmuch", "inc", "indeed", "indicate", "indicated", "indicates", "inner",
"insofar", "instead", "into", "inward", "is", "isn't", "it", "it'd", "it'll", "it's", "its",
"itself", "j", "just", "k", "keep", "keeps", "kept", "know", "known", "knows", "l", "last",
"lately", "later", "latter", "latterly", "least", "less", "lest", "let", "let's", "like", "liked",
"likely", "little", "look", "looking", "looks", "ltd", "m", "mainly", "many", "may", "maybe", "me",
"mean", "meanwhile", "merely", "might", "more", "moreover", "most", "mostly", "much", "must", "my",
"myself", "n", "name", "namely", "nd", "near", "nearly", "necessary", "need", "needs", "neither",
"never", "nevertheless", "new", "next", "nine", "no", "nobody", "non", "none", "noone", "nor",
"normally", "not", "nothing", "novel", "now", "nowhere", "o", "obviously", "of", "off", "often",
"oh", "ok", "okay", "old", "on", "once", "one", "ones", "only", "onto", "or", "other", "others",
"otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "own", "p",
"particular", "particularly", "per", "perhaps", "placed", "please", "plus", "possible",
"presumably", "probably", "provides", "q", "que", "quite", "qv", "r", "rather", "rd", "re",
"really", "reasonably", "regarding", "regardless", "regards", "relatively", "respectively",
"right", "s", "said", "same", "saw", "say", "saying", "says", "second", "secondly", "see",
"seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent",
"serious", "seriously", "seven", "several", "shall", "she", "should", "shouldn't", "since", "six",
"so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", "somewhat",
"somewhere", "soon", "sorry", "specified", "specify", "specifying", "still", "sub", "such", "sup",
"sure", "t", "t's", "take", "taken", "tell", "tends", "th", "than", "thank", "thanks", "thanx",
"that", "that's", "thats", "the", "their", "theirs", "them", "themselves", "then", "thence",
"there", "there's", "thereafter", "thereby", "therefore", "therein", "theres", "thereupon",
"these", "they", "they'd", "they'll", "they're", "they've", "think", "third", "this", "thorough",
"thoroughly", "those", "though", "three", "through", "throughout", "thru", "thus", "to",
"together", "too", "took", "toward", "towards", "tried", "tries", "truly", "try", "trying",
"twice", "two", "u", "un", "under", "unfortunately", "unless", "unlikely", "until", "unto", "up",
"upon", "us", "use", "used", "useful", "uses", "using", "usually", "uucp", "v", "value", "various",
"very", "via", "viz", "vs", "w", "want", "wants", "was", "wasn't", "way", "we", "we'd", "we'll",
"we're", "we've", "welcome", "well", "went", "were", "weren't", "what", "what's", "whatever",
"when", "whence", "whenever", "where", "where's", "whereafter", "whereas", "whereby", "wherein",
"whereupon", "wherever", "whether", "which", "while", "whither", "who", "who's", "whoever",
"whole", "whom", "whose", "why", "will", "willing", "wish", "with", "within", "without", "won't",
"wonder", "would", "wouldn't", "x", "y", "yes", "yet", "you", "you'd", "you'll", "you're",
"you've", "your", "yours", "yourself", "yourselves", "z", "zero"]
# Matches every run of one or more characters that are not ASCII letters (digits, punctuation, whitespace, ...).
# The mapper substitutes each match with the empty string so that only the letters of a word remain.
SPECIAL_CHARACTERS_RE = re.compile("[^A-Za-z]+")
# Whitelist of tracked words: the mapper only emits a word when its normalized form appears in this list. The entries
# are lowercase, accent-free ASCII (e.g. "senor", "anos"), i.e. they are already in the form the mapper produces.
# NOTE(review): presumably these are the most frequent words of the target corpus - confirm where the list came from.
MAIN_WORDS = ["project", "casa", "don", "dios", "hombre", "vida", "gutenbergtm", "senor", "tenia", "ojos", "work",
"padre", "tierra", "mujer", "noche", "mano", "mundo", "anos", "hombres", "habian", "works", "cabeza",
"gutenberg", "cosa", "rey", "pueblo", "amor", "juan", "voz", "hijo", "dona", "manos", "alma",
"nombre", "electronic", "puerta", "madre", "joven", "senora", "paso", "podia", "iba", "ciudad",
"palabras", "corazon", "agua", "gente", "camino", "punto", "muerte", "foundation", "grande", "luz",
"san", "mil", "demas", "pobre", "visto", "terms", "hija", "algun", "amigo", "parecia", "palabra",
"hubiera", "alla", "mar", "calle", "hijos", "cielo", "manana", "cuerpo", "espana", "seaor", "hora",
"ano", "sol", "caso", "agreement", "rio", "vista", "madrid", "historia", "idea", "fuerza", "razon",
"capitan", "vino", "mujeres", "frente", "puesto", "espiritu", "obra", "pie", "duda", "causa", "orden",
"decia", "seria", "states"]
def steps(self):
    """Return the two steps that make up this job, in execution order.

    The first step maps every input line to per-(word, document) occurrences and sums them; the second step
    receives those sums grouped by word and writes the ranked document list for each word.
    """
    # Step 1: count how many times each tracked word occurs in each document.
    counting_step = MRStep(mapper=self.mapper_get_occurrence_for_word_and_doc_name,
                           reducer=self.reducer_sum_occurrences_for_word_and_doc_name)
    # Step 2: reducer only - order the documents of each word by those counts.
    ranking_step = MRStep(reducer=self.reducer_sort_doc_names_for_word)
    return [counting_step, ranking_step]
# Yields [(word, document name), 1] for each word in the line that is not a stop word and is listed in MAIN_WORDS.
def mapper_get_occurrence_for_word_and_doc_name(self, _, line):
    """Yield ``((word, document name), 1)`` for every tracked word found in ``line``.

    Each whitespace-separated token is lowercased, stripped of accents and of every character that is not an ASCII
    letter. The resulting word is emitted only if the token was not a stop word before normalization, the word is
    not a stop word after normalization, and the word is listed in ``MAIN_WORDS``.

    :param _: input key; it is not used.
    :param line: one line of input, either raw bytes (decoded as ISO-8859-1) or already-decoded text.
    :return: generator of ``((normalized word, document name), 1)`` pairs. The document name is ``None`` when
        neither input-file environment variable is set.
    """
    # os.getenv() returns None for a missing variable (it never raises KeyError), so the fallback to the older
    # variable name has to be chained explicitly.
    doc_name = os.getenv('mapreduce_map_input_file') or os.getenv('map_input_file')
    for word in line.split():
        try:
            # Raw bytes are decoded to text so the Unicode transformations below can be applied; text that is
            # already decoded is used as it is.
            if isinstance(word, bytes):
                word = word.decode("iso-8859-1")
            norm_word = word.lower()
            # First filter: drops the token if, before any normalization, it is already a stop word.
            if norm_word in self.STOP_WORDS_ES or norm_word in self.STOP_WORDS_EN:
                continue
            # Decomposes accented letters into base letter + combining mark ('Normal Form Decomposed'); encoding
            # to ASCII with 'ignore' then drops the marks, leaving the unaccented letters.
            norm_word = unicodedata.normalize('NFD', norm_word).encode('ascii', 'ignore')
            # On Python 3 the encode step returns bytes; bring it back to text so the regex and the list lookups
            # below operate on the same string type as the word lists.
            if not isinstance(norm_word, str):
                norm_word = norm_word.decode('ascii')
            # Removes every non-letter character and lowercases whatever is left.
            norm_word = self.SPECIAL_CHARACTERS_RE.sub('', norm_word).lower()
        except (UnicodeError, TypeError):
            # Best effort: a token that cannot be decoded or normalized is discarded instead of failing the task.
            continue
        # The yield is kept outside the try block so that closing the generator is never intercepted.
        if not norm_word:
            continue
        if norm_word in self.STOP_WORDS_ES or norm_word in self.STOP_WORDS_EN:
            continue
        if norm_word in self.MAIN_WORDS:
            yield (norm_word, doc_name), 1
# Yields [word, (document name, cumulative_occurrences)] for each (word, document_name) key received.
def reducer_sum_occurrences_for_word_and_doc_name(self, word_and_doc_name, occurrences):
    """Re-key the per-(word, document) counts by word alone.

    :param word_and_doc_name: two-element key holding the word and the document name.
    :param occurrences: iterable of the individual counts emitted for that key.
    :return: generator yielding a single ``(word, (document name, total occurrences))`` pair.
    """
    term, source_document = word_and_doc_name
    total_occurrences = sum(occurrences)
    yield term, (source_document, total_occurrences)
'''
Prints [word, document name list] for each word after sorting the document name list in descending order using the
cumulative occurrences of the word as the criterion.
'''
def reducer_sort_doc_names_for_word(self, word, doc_name_and_cumulative_occurrences):
    """Print one row listing the documents that contain ``word``, most occurrences first.

    The row has the form ``word;;;doc1;;doc2;;...``. ';;;' separates the word from the document list and ';;'
    separates the document names, to avoid conflicts when the output is parsed later. The row is written
    directly to standard output; nothing is yielded or returned.

    :param word: the word shared by all the received pairs.
    :param doc_name_and_cumulative_occurrences: iterable of ``(document name, cumulative occurrences)`` pairs.
    """
    # sorted() consumes the incoming generator and is stable, so documents with the same number of occurrences
    # keep the order in which they were received.
    ranked_pairs = sorted(doc_name_and_cumulative_occurrences, key=lambda pair: pair[1], reverse=True)
    # Only the document names are part of the output; the counts were needed just for the ordering.
    doc_name_list = [doc_name for doc_name, _ in ranked_pairs]
    row = word + ";;;" + ';;'.join(map(str, doc_name_list))
    # The call form with a single argument prints the same text on Python 2 and Python 3.
    print(row)
# Launches the job only when this file is executed as a script, not when it is imported as a module.
if __name__ == '__main__':
    MRDocumentsContainingWord.run()