adjustments necessary to run open_ended_coders.py outside of Docker #26

Merged: 1 commit, Nov 8, 2024
mpeds/mpeds/open_ended_coders.py: 49 changes (22 additions, 27 deletions)
@@ -4,12 +4,13 @@
import re

import urllib
import urllib2
import urllib.parse
import urllib.request
import json

import os

from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction import _stop_words
from nltk.tag import StanfordNERTagger
from nltk.tokenize.stanford import StanfordTokenizer
from pkg_resources import resource_filename
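A note on the import changes above: urllib2 does not exist in Python 3; request handling moves to urllib.request and quoting/encoding to urllib.parse. A minimal sketch of the mapping used later in this file (the URL below is a placeholder, not taken from the PR):

    import urllib.parse
    import urllib.request

    # Python 2: urllib.urlencode(...) / urllib.quote_plus(...)
    body = urllib.parse.urlencode({'q': 'some article text'})

    # Python 2: urllib2.Request(url, data) / urllib2.urlopen(req)
    req = urllib.request.Request('http://localhost:8080/parse/text', body.encode('utf-8'))
    # urllib.request.urlopen(req) would perform the POST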
@@ -48,7 +49,7 @@ def getSize(self, text, as_str = False, verbose = False):

'''

text = text.decode('utf-8')
# text = text.decode('utf-8')

if not self.RE:
self._loadRegexPatterns()
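Commenting out the decode('utf-8') call is the minimal fix here: in Python 3 the text argument is already a str, and calling .decode() on a str raises AttributeError. If callers might still pass bytes, a small defensive guard (a hypothetical helper, not part of this PR) would cover both cases:

    def _ensure_text(text):
        # Accept bytes or str; only decode when bytes are passed in.
        if isinstance(text, bytes):
            return text.decode('utf-8')
        return text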
@@ -66,7 +67,7 @@ def getSize(self, text, as_str = False, verbose = False):
for s in sentences:

if verbose:
print '\nPROCESSING SENTENCE: ' + s
print('\nPROCESSING SENTENCE: ' + s)

## hack to stop the issue with "tens"
s = re.sub("tens of thousands", "10,000", s, flags = re.I)
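Most of the remaining hunks are the same mechanical change: Python 2 print statements become print() function calls, since the statement form is a SyntaxError in Python 3. For example:

    s = 'Tens of thousands marched downtown.'
    # Python 2 (no longer valid):  print '\nPROCESSING SENTENCE: ' + s
    print('\nPROCESSING SENTENCE: ' + s)   # Python 3 form used throughout the diff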
@@ -85,7 +86,7 @@ def getSize(self, text, as_str = False, verbose = False):
for i in range(0, i_end):

if verbose:
print 'At token: ' + tokens[i]
print('At token: ' + tokens[i])

loc += len(tokens[i]) + 1
size = None
@@ -109,35 +110,29 @@ def getSize(self, text, as_str = False, verbose = False):
for j in range(r_start, r_context):

if verbose:
print ' ' + tokens[j]
print(' ' + tokens[j])

if self.RE['NUMBERS'].search(tokens[j]) and j - i < 3:
## skip things which will be coded in the next pass
## e.g. tens of thousands or two dozen

if verbose:
print ' -> Detected number, skipping ahead'
print(' -> Detected number, skipping ahead')

break

elif not self.RE['NVERBS'].search(' '.join(tokens[i:])):
## filter out all verbs we don't want
if tokens[j] in self.P_SUBJ['protest']:
## if there is a protest subj, use that
size = tokens[i]

size = tokens[i]

if verbose:
print ' -> Detected protest subject, setting size to ' + size

elif (self.RE['SUBJ'].search(tokens[j]) or self.RE['ETHNIC'].search(tokens[j])) \
if (self.RE['SUBJ'].search(tokens[j]) or self.RE['ETHNIC'].search(tokens[j])) \
and self.RE['VERBS'].search(' '.join(tokens[i:])):
## if not, test for a protest verb

size = tokens[i]

if verbose:
print ' -> Detected protest verb, setting size to ' + size
print(' -> Detected protest verb, setting size to ' + size)



@@ -146,7 +141,7 @@ def getSize(self, text, as_str = False, verbose = False):
for j in range(l_context, i):

if verbose:
print ' ' + tokens[j]
print(' ' + tokens[j])

#if RE_GROUPS.search(tokens[j]) and RE_VERBS.search(' '.join(tokens[i:])) and not size:
if not size:
@@ -156,7 +151,7 @@ def getSize(self, text, as_str = False, verbose = False):
size = tokens[i]

if verbose:
print ' -> Detected protest group or verb, setting size to ' + size
print(' -> Detected protest group or verb, setting size to ' + size)



@@ -165,19 +160,19 @@ def getSize(self, text, as_str = False, verbose = False):
size = '-'.join([tokens[j], size])

if verbose:
print ' -> Detected pre-number, setting size to ' + size
print(' -> Detected pre-number, setting size to ' + size)

if len(sizes) > 0 and self._strToNum(tokens[j]) == sizes[len(sizes) - 1]:

sizes = sizes[1:(len(sizes) - 1)]

if verbose:
print '-> Pre-number added to sizes at last iteration, removing it'
print('-> Pre-number added to sizes at last iteration, removing it')

if size:
## parse and append
if verbose:
print '-> Adding ' + str(self._strToNum(size)) + ' to sizes'
print('-> Adding ' + str(self._strToNum(size)) + ' to sizes')

sizes.append(self._strToNum(size))

@@ -233,7 +228,7 @@ def _loadRegexPatterns(self):
self._loadNumberMapping()

# remove tens, which is almost never used by itself
number_set = self.NUM_MAP.keys()
number_set = list(self.NUM_MAP.keys())
number_set.remove('tens')

S_LESS10 = r'one|two|three|four|five|six|seven|eight|nine'
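The list() wrapper above is required because Python 3's dict.keys() returns a view object, which has no remove() method. An illustrative snippet (the mapping values are made up, not the module's real NUM_MAP):

    NUM_MAP = {'ten': 10, 'tens': 10, 'dozen': 12}
    number_set = list(NUM_MAP.keys())   # copy the view into a mutable list
    number_set.remove('tens')           # fine on the list copy
    # NUM_MAP.keys().remove('tens') would raise AttributeError on Python 3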
@@ -264,14 +259,14 @@ def _loadSpecialWords(self):
self.S_PREFIX = ['around', 'up to', 'as many as', 'some', 'many', 'nearly', 'more than', 'about']

self.P_SUBJ = {
'protest': ['protesters', 'protestors', 'demonstrators', 'activists', 'strikers', 'marchers', 'signatures',
'protest': ['protesters', 'protestors', 'students', 'people','demonstrators', 'activists', 'strikers', 'marchers', 'signatures',
'counter-demonstrators', 'counter-demonstraters', 'counter-protesters', 'counter-protestors', 'counterprotesters',
'counterprotestors']
}

self.AGW = ['Agence France-Presse, English Service', 'Associated Press Worldstream, English Service']

self.SWS = list(stop_words.ENGLISH_STOP_WORDS)
self.SWS = list(_stop_words.ENGLISH_STOP_WORDS)


def _loadNumberMapping(self):
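The stop-word import change tracks scikit-learn's reorganization: the old sklearn.feature_extraction.stop_words module was deprecated and later removed, and the renamed _stop_words module is private, so it may move again. A sketch of the public alternative, assuming a scikit-learn version that exposes it:

    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

    SWS = list(ENGLISH_STOP_WORDS)   # same stop-word set, via the documented path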
@@ -412,7 +407,7 @@ def _urlencode_utf8(self, params):
if hasattr(params, 'items'):
params = params.items()
return '&'.join(
(urllib.quote_plus(k.encode('utf8'), safe='/') + '=' + urllib.quote_plus(v.encode('utf8'), safe='/')
(urllib.parse.quote_plus(k.encode('utf8'), safe='/') + '=' + urllib.parse.quote_plus(v.encode('utf8'), safe='/')
for k, v in params) )
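Beyond swapping quote_plus to urllib.parse.quote_plus, Python 3's standard library can replace most of this helper: urllib.parse.urlencode already performs quote_plus-style, UTF-8 percent-encoding of a mapping. A rough equivalent (the keys and values here are illustrative only):

    import urllib.parse

    params = {'q': 'Tens of thousands marched downtown.'}
    encoded = urllib.parse.urlencode(params, safe='/')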


@@ -435,8 +430,8 @@ def _getCLIFF(self, text):

while obj is None:
url = 'http://%s/parse/text' % self.cliff_url
req = urllib2.Request(url, data)
res = urllib2.urlopen(req)
req = urllib.request.Request(url, data.encode('utf-8'))
res = urllib.request.urlopen(req)
obj = json.loads(res.read())

if obj is not None:
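The key Python 3 detail in this hunk is data.encode('utf-8'): urllib.request.Request requires the POST body as bytes, not str. A self-contained sketch of the round trip (the endpoint path and the 'q' payload key are assumptions, not confirmed against the CLIFF API):

    import json
    import urllib.parse
    import urllib.request

    def query_cliff(cliff_url, text):
        url = 'http://%s/parse/text' % cliff_url
        data = urllib.parse.urlencode({'q': text})                # str
        req = urllib.request.Request(url, data.encode('utf-8'))   # body must be bytes
        with urllib.request.urlopen(req) as res:
            return json.loads(res.read())                         # bytes accepted on 3.6+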