Commit 340f06e
Merge pull request #26 from 18kimn/size-adjustment
adjustments necessary to run `open_ended_coders.py` outside of Docker
alexhanna authored Nov 8, 2024
2 parents 4fec922 + 008806c
Showing 1 changed file with 22 additions and 27 deletions.
mpeds/mpeds/open_ended_coders.py (49 changes: 22 additions & 27 deletions)
@@ -4,12 +4,13 @@
 import re

 import urllib
-import urllib2
+import urllib.parse
+import urllib.request
 import json

 import os

-from sklearn.feature_extraction import stop_words
+from sklearn.feature_extraction import _stop_words
 from nltk.tag import StanfordNERTagger
 from nltk.tokenize.stanford import StanfordTokenizer
 from pkg_resources import resource_filename
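
Python 3 removed urllib2 and split its functionality across urllib.request and urllib.parse, which must be imported as explicit submodules. A minimal sketch of the equivalent calls under the new imports (the URL is a placeholder):

    import urllib.parse
    import urllib.request

    # urllib.quote_plus in Python 2:
    q = urllib.parse.quote_plus('two dozen protesters')

    # urllib2.urlopen in Python 2; read() returns bytes, not str:
    with urllib.request.urlopen('http://example.com/search?q=' + q) as res:
        body = res.read()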
@@ -48,7 +49,7 @@ def getSize(self, text, as_str = False, verbose = False):
         '''

-        text = text.decode('utf-8')
+        # text = text.decode('utf-8')

         if not self.RE:
             self._loadRegexPatterns()

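
The decode call is commented out because Python 3 strings are already Unicode; str has no decode method, so the old line would raise AttributeError. If callers might still pass bytes, a guarded variant (an assumption, not part of this commit) is safer:

    def ensure_text(text):
        # Decode only when given bytes; Python 3 str passes through unchanged.
        if isinstance(text, bytes):
            return text.decode('utf-8')
        return text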
@@ -66,7 +67,7 @@ def getSize(self, text, as_str = False, verbose = False):
         for s in sentences:

             if verbose:
-                print '\nPROCESSING SENTENCE: ' + s
+                print('\nPROCESSING SENTENCE: ' + s)

             ## hack to stop the issue with "tens"
             s = re.sub("tens of thousands", "10,000", s, flags = re.I)
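
The print conversions here and in the hunks below all follow the same rule: the Python 2 print statement is a SyntaxError under Python 3, while the parenthesized form works in both. For example:

    # print 'At token: ' + token      # Python 2 only; SyntaxError in Python 3
    print('At token: ' + 'marchers')  # valid in Python 3 (and Python 2.7)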
@@ -85,7 +86,7 @@ def getSize(self, text, as_str = False, verbose = False):
             for i in range(0, i_end):

                 if verbose:
-                    print 'At token: ' + tokens[i]
+                    print('At token: ' + tokens[i])

                 loc += len(tokens[i]) + 1
                 size = None
@@ -109,35 +110,29 @@ def getSize(self, text, as_str = False, verbose = False):
                 for j in range(r_start, r_context):

                     if verbose:
-                        print ' ' + tokens[j]
+                        print(' ' + tokens[j])

                     if self.RE['NUMBERS'].search(tokens[j]) and j - i < 3:
                         ## skip things which will be coded in the next pass
                         ## e.g. tens of thousands or two dozen

                         if verbose:
-                            print ' -> Detected number, skipping ahead'
+                            print(' -> Detected number, skipping ahead')

                         break

-                    elif not self.RE['NVERBS'].search(' '.join(tokens[i:])):
-                        ## filter out all verbs we don't want
-                        if tokens[j] in self.P_SUBJ['protest']:
-                            ## if there is a protest subj, use that
-                            size = tokens[i]

+                    size = tokens[i]

-                            if verbose:
-                                print ' -> Detected protest subject, setting size to ' + size

-                    elif (self.RE['SUBJ'].search(tokens[j]) or self.RE['ETHNIC'].search(tokens[j])) \
+                    if (self.RE['SUBJ'].search(tokens[j]) or self.RE['ETHNIC'].search(tokens[j])) \
                         and self.RE['VERBS'].search(' '.join(tokens[i:])):
                         ## if not, test for a protest verb

                         size = tokens[i]

                         if verbose:
-                            print ' -> Detected protest verb, setting size to ' + size
+                            print(' -> Detected protest verb, setting size to ' + size)


@@ -146,7 +141,7 @@ def getSize(self, text, as_str = False, verbose = False):
             for j in range(l_context, i):

                 if verbose:
-                    print ' ' + tokens[j]
+                    print(' ' + tokens[j])

                 #if RE_GROUPS.search(tokens[j]) and RE_VERBS.search(' '.join(tokens[i:])) and not size:
                 if not size:
@@ -156,7 +151,7 @@ def getSize(self, text, as_str = False, verbose = False):
                         size = tokens[i]

                         if verbose:
-                            print ' -> Detected protest group or verb, setting size to ' + size
+                            print(' -> Detected protest group or verb, setting size to ' + size)


@@ -165,19 +160,19 @@ def getSize(self, text, as_str = False, verbose = False):
                             size = '-'.join([tokens[j], size])

                             if verbose:
-                                print ' -> Detected pre-number, setting size to ' + size
+                                print(' -> Detected pre-number, setting size to ' + size)

                     if len(sizes) > 0 and self._strToNum(tokens[j]) == sizes[len(sizes) - 1]:

                         sizes = sizes[1:(len(sizes) - 1)]

                         if verbose:
-                            print '-> Pre-number added to sizes at last iteration, removing it'
+                            print('-> Pre-number added to sizes at last iteration, removing it')

                 if size:
                     ## parse and append
                     if verbose:
-                        print '-> Adding ' + str(self._strToNum(size)) + ' to sizes'
+                        print('-> Adding ' + str(self._strToNum(size)) + ' to sizes')

                     sizes.append(self._strToNum(size))

Expand Down Expand Up @@ -233,7 +228,7 @@ def _loadRegexPatterns(self):
self._loadNumberMapping()

# remove tens, which is almost never used by itself
number_set = self.NUM_MAP.keys()
number_set = list(self.NUM_MAP.keys())
number_set.remove('tens')

S_LESS10 = r'one|two|three|four|five|six|seven|eight|nine'
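
In Python 3, dict.keys() returns a view object with no remove method, so the view has to be materialized into a list first. A standalone illustration with a toy mapping:

    NUM_MAP = {'tens': 10, 'dozens': 12, 'hundreds': 100}

    number_set = list(NUM_MAP.keys())  # view -> list, so remove() works
    number_set.remove('tens')
    # number_set is now ['dozens', 'hundreds']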
@@ -264,14 +259,14 @@ def _loadSpecialWords(self):
         self.S_PREFIX = ['around', 'up to', 'as many as', 'some', 'many', 'nearly', 'more than', 'about']

         self.P_SUBJ = {
-            'protest': ['protesters', 'protestors', 'demonstrators', 'activists', 'strikers', 'marchers', 'signatures',
+            'protest': ['protesters', 'protestors', 'students', 'people', 'demonstrators', 'activists', 'strikers', 'marchers', 'signatures',
                 'counter-demonstrators', 'counter-demonstraters', 'counter-protesters', 'counter-protestors', 'counterprotesters',
                 'counterprotestors']
             }

         self.AGW = ['Agence France-Presse, English Service', 'Associated Press Worldstream, English Service']

-        self.SWS = list(stop_words.ENGLISH_STOP_WORDS)
+        self.SWS = list(_stop_words.ENGLISH_STOP_WORDS)


     def _loadNumberMapping(self):
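
The stop-words module was renamed in scikit-learn 0.22, which is why the old import fails on current versions. The leading underscore marks _stop_words as private; the same frozenset is also exposed publicly, which may be the more durable import:

    # Public location of the identical frozenset:
    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

    SWS = list(ENGLISH_STOP_WORDS)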
@@ -412,7 +407,7 @@ def _urlencode_utf8(self, params):
         if hasattr(params, 'items'):
             params = params.items()
         return '&'.join(
-            (urllib.quote_plus(k.encode('utf8'), safe='/') + '=' + urllib.quote_plus(v.encode('utf8'), safe='/')
+            (urllib.parse.quote_plus(k.encode('utf8'), safe='/') + '=' + urllib.parse.quote_plus(v.encode('utf8'), safe='/')
                 for k, v in params) )


@@ -435,8 +430,8 @@ def _getCLIFF(self, text):

         while obj is None:
             url = 'http://%s/parse/text' % self.cliff_url
-            req = urllib2.Request(url, data)
-            res = urllib2.urlopen(req)
+            req = urllib.request.Request(url, data.encode('utf-8'))
+            res = urllib.request.urlopen(req)
             obj = json.loads(res.read())

         if obj is not None:
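
urllib.request.Request requires the POST body as bytes, so the str payload now gets an explicit .encode('utf-8'); passing a str raises TypeError in Python 3. A self-contained sketch of the same request pattern (host and payload are placeholders, assuming a CLIFF server is listening):

    import json
    import urllib.parse
    import urllib.request

    cliff_url = 'localhost:8080'  # placeholder host:port
    data = urllib.parse.urlencode({'q': 'Thousands of protesters marched in Madison.'})

    req = urllib.request.Request('http://%s/parse/text' % cliff_url, data.encode('utf-8'))
    res = urllib.request.urlopen(req)
    obj = json.loads(res.read())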
