Commit 340f06e
Merge pull request #26 from 18kimn/size-adjustment
adjustments necessary to run `open_ended_coders.py` outside of Docker
alexhanna authored Nov 8, 2024
2 parents 4fec922 + 008806c
Showing 1 changed file with 22 additions and 27 deletions.
mpeds/mpeds/open_ended_coders.py (49 changes: 22 additions & 27 deletions)
@@ -4,12 +4,13 @@
 import re

 import urllib
-import urllib2
+import urllib.parse
+import urllib.request
 import json

 import os

-from sklearn.feature_extraction import stop_words
+from sklearn.feature_extraction import _stop_words
 from nltk.tag import StanfordNERTagger
 from nltk.tokenize.stanford import StanfordTokenizer
 from pkg_resources import resource_filename
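
Python 3 removed urllib2 and split its functionality across urllib.request and urllib.parse, which must be imported as explicit submodules. A minimal sketch of the equivalent calls under the new imports (the URL is a placeholder):

    import urllib.parse
    import urllib.request

    # urllib.quote_plus in Python 2:
    q = urllib.parse.quote_plus('two dozen protesters')

    # urllib2.urlopen in Python 2; read() returns bytes, not str:
    with urllib.request.urlopen('http://example.com/search?q=' + q) as res:
        body = res.read()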
@@ -48,7 +49,7 @@ def getSize(self, text, as_str = False, verbose = False):
         '''

-        text = text.decode('utf-8')
+        # text = text.decode('utf-8')

         if not self.RE:
             self._loadRegexPatterns()

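
The decode call is commented out because Python 3 strings are already Unicode; str has no decode method, so the old line would raise AttributeError. If callers might still pass bytes, a guarded variant (an assumption, not part of this commit) is safer:

    def ensure_text(text):
        # Decode only when given bytes; Python 3 str passes through unchanged.
        if isinstance(text, bytes):
            return text.decode('utf-8')
        return text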
@@ -66,7 +67,7 @@ def getSize(self, text, as_str = False, verbose = False):
         for s in sentences:

             if verbose:
-                print '\nPROCESSING SENTENCE: ' + s
+                print('\nPROCESSING SENTENCE: ' + s)

             ## hack to stop the issue with "tens"
             s = re.sub("tens of thousands", "10,000", s, flags = re.I)
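
The print conversions here and in the hunks below all follow the same rule: the Python 2 print statement is a SyntaxError under Python 3, while the parenthesized form works in both. For example:

    # print 'At token: ' + token      # Python 2 only; SyntaxError in Python 3
    print('At token: ' + 'marchers')  # valid in Python 3 (and Python 2.7)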
@@ -85,7 +86,7 @@ def getSize(self, text, as_str = False, verbose = False):
             for i in range(0, i_end):

                 if verbose:
-                    print 'At token: ' + tokens[i]
+                    print('At token: ' + tokens[i])

                 loc += len(tokens[i]) + 1
                 size = None
@@ -109,35 +110,29 @@ def getSize(self, text, as_str = False, verbose = False):
                 for j in range(r_start, r_context):

                     if verbose:
-                        print ' ' + tokens[j]
+                        print(' ' + tokens[j])

                     if self.RE['NUMBERS'].search(tokens[j]) and j - i < 3:
                         ## skip things which will be coded in the next pass
                         ## e.g. tens of thousands or two dozen

                         if verbose:
-                            print ' -> Detected number, skipping ahead'
+                            print(' -> Detected number, skipping ahead')

                         break

-                    elif not self.RE['NVERBS'].search(' '.join(tokens[i:])):
-                        ## filter out all verbs we don't want
-                        if tokens[j] in self.P_SUBJ['protest']:
-                            ## if there is a protest subj, use that
-                            size = tokens[i]

+                    size = tokens[i]

-                            if verbose:
-                                print ' -> Detected protest subject, setting size to ' + size

-                    elif (self.RE['SUBJ'].search(tokens[j]) or self.RE['ETHNIC'].search(tokens[j])) \
+                    if (self.RE['SUBJ'].search(tokens[j]) or self.RE['ETHNIC'].search(tokens[j])) \
                         and self.RE['VERBS'].search(' '.join(tokens[i:])):
                         ## if not, test for a protest verb

                         size = tokens[i]

                         if verbose:
-                            print ' -> Detected protest verb, setting size to ' + size
+                            print(' -> Detected protest verb, setting size to ' + size)


@@ -146,7 +141,7 @@ def getSize(self, text, as_str = False, verbose = False):
             for j in range(l_context, i):

                 if verbose:
-                    print ' ' + tokens[j]
+                    print(' ' + tokens[j])

                 #if RE_GROUPS.search(tokens[j]) and RE_VERBS.search(' '.join(tokens[i:])) and not size:
                 if not size:
@@ -156,7 +151,7 @@ def getSize(self, text, as_str = False, verbose = False):
                         size = tokens[i]

                         if verbose:
-                            print ' -> Detected protest group or verb, setting size to ' + size
+                            print(' -> Detected protest group or verb, setting size to ' + size)


@@ -165,19 +160,19 @@ def getSize(self, text, as_str = False, verbose = False):
                             size = '-'.join([tokens[j], size])

                             if verbose:
-                                print ' -> Detected pre-number, setting size to ' + size
+                                print(' -> Detected pre-number, setting size to ' + size)

                     if len(sizes) > 0 and self._strToNum(tokens[j]) == sizes[len(sizes) - 1]:

                         sizes = sizes[1:(len(sizes) - 1)]

                         if verbose:
-                            print '-> Pre-number added to sizes at last iteration, removing it'
+                            print('-> Pre-number added to sizes at last iteration, removing it')

                 if size:
                     ## parse and append
                     if verbose:
-                        print '-> Adding ' + str(self._strToNum(size)) + ' to sizes'
+                        print('-> Adding ' + str(self._strToNum(size)) + ' to sizes')

                     sizes.append(self._strToNum(size))

Expand Down Expand Up @@ -233,7 +228,7 @@ def _loadRegexPatterns(self):
self._loadNumberMapping()

# remove tens, which is almost never used by itself
number_set = self.NUM_MAP.keys()
number_set = list(self.NUM_MAP.keys())
number_set.remove('tens')

S_LESS10 = r'one|two|three|four|five|six|seven|eight|nine'
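
In Python 3, dict.keys() returns a view object with no remove method, so the view has to be materialized into a list first. A standalone illustration with a toy mapping:

    NUM_MAP = {'tens': 10, 'dozens': 12, 'hundreds': 100}

    number_set = list(NUM_MAP.keys())  # view -> list, so remove() works
    number_set.remove('tens')
    # number_set is now ['dozens', 'hundreds']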
@@ -264,14 +259,14 @@ def _loadSpecialWords(self):
         self.S_PREFIX = ['around', 'up to', 'as many as', 'some', 'many', 'nearly', 'more than', 'about']

         self.P_SUBJ = {
-            'protest': ['protesters', 'protestors', 'demonstrators', 'activists', 'strikers', 'marchers', 'signatures',
+            'protest': ['protesters', 'protestors', 'students', 'people', 'demonstrators', 'activists', 'strikers', 'marchers', 'signatures',
                 'counter-demonstrators', 'counter-demonstraters', 'counter-protesters', 'counter-protestors', 'counterprotesters',
                 'counterprotestors']
             }

         self.AGW = ['Agence France-Presse, English Service', 'Associated Press Worldstream, English Service']

-        self.SWS = list(stop_words.ENGLISH_STOP_WORDS)
+        self.SWS = list(_stop_words.ENGLISH_STOP_WORDS)


     def _loadNumberMapping(self):
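
The stop-words module was renamed in scikit-learn 0.22, which is why the old import fails on current versions. The leading underscore marks _stop_words as private; the same frozenset is also exposed publicly, which may be the more durable import:

    # Public location of the identical frozenset:
    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

    SWS = list(ENGLISH_STOP_WORDS)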
@@ -412,7 +407,7 @@ def _urlencode_utf8(self, params):
         if hasattr(params, 'items'):
             params = params.items()
         return '&'.join(
-            (urllib.quote_plus(k.encode('utf8'), safe='/') + '=' + urllib.quote_plus(v.encode('utf8'), safe='/')
+            (urllib.parse.quote_plus(k.encode('utf8'), safe='/') + '=' + urllib.parse.quote_plus(v.encode('utf8'), safe='/')
                 for k, v in params) )


@@ -435,8 +430,8 @@ def _getCLIFF(self, text):

         while obj is None:
             url = 'http://%s/parse/text' % self.cliff_url
-            req = urllib2.Request(url, data)
-            res = urllib2.urlopen(req)
+            req = urllib.request.Request(url, data.encode('utf-8'))
+            res = urllib.request.urlopen(req)
             obj = json.loads(res.read())

         if obj is not None:
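
urllib.request.Request requires the POST body as bytes, so the str payload now gets an explicit .encode('utf-8'); passing a str raises TypeError in Python 3. A self-contained sketch of the same request pattern (host and payload are placeholders, assuming a CLIFF server is listening):

    import json
    import urllib.parse
    import urllib.request

    cliff_url = 'localhost:8080'  # placeholder host:port
    data = urllib.parse.urlencode({'q': 'Thousands of protesters marched in Madison.'})

    req = urllib.request.Request('http://%s/parse/text' % cliff_url, data.encode('utf-8'))
    res = urllib.request.urlopen(req)
    obj = json.loads(res.read())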
