TrainFunctions.py
import nltk


def getCasing(word):
    """ Returns the casing of a word """
    if len(word) == 0:
        return 'other'
    elif word.isdigit():  # Is a digit
        return 'numeric'
    elif word.islower():  # All lower case
        return 'allLower'
    elif word.isupper():  # All upper case
        return 'allUpper'
    elif word[0].isupper():  # Initial char upper case (remainder may be mixed)
        return 'initialUpper'

    return 'other'
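# Illustrative mapping (added as a sketch; the example inputs are hypothetical,
# not from the original file):
#   getCasing("2023")  -> 'numeric'
#   getCasing("hello") -> 'allLower'
#   getCasing("NASA")  -> 'allUpper'
#   getCasing("Paris") -> 'initialUpper'
#   getCasing("iPad")  -> 'other'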
def checkSentenceSanity(sentence):
    """ Checks the sanity of the sentence. If the sentence is, for example, all upper case, it is rejected """
    caseDist = nltk.FreqDist()

    for token in sentence:
        caseDist[getCasing(token)] += 1

    if caseDist.most_common(1)[0][0] != 'allLower':
        return False

    return True
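# Illustrative behaviour (hypothetical example tokens, not from the original
# file): a sentence whose dominant casing class is not 'allLower' is rejected,
# so checkSentenceSanity(["THIS", "IS", "A", "HEADLINE"]) would return False,
# while checkSentenceSanity(["The", "cat", "sat", "on", "the", "mat"]) returns True.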
def updateDistributionsFromSentences(text, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist):
    """
    Updates the NLTK frequency distributions based on a list of sentences.

    text: Array of sentences. Each sentence must be an array of tokens.
    """
    # :: Create unigram lookup ::
    for sentence in text:
        if not checkSentenceSanity(sentence):
            continue

        for tokenIdx in range(1, len(sentence)):  # Skip the first token; its casing is dictated by sentence position
            word = sentence[tokenIdx]
            uniDist[word] += 1

            if word.lower() not in wordCasingLookup:
                wordCasingLookup[word.lower()] = set()

            wordCasingLookup[word.lower()].add(word)

    # :: Create backward + forward bigram lookup ::
    for sentence in text:
        if not checkSentenceSanity(sentence):
            continue

        for tokenIdx in range(2, len(sentence)):  # Start at 2 so the sentence-initial token is never used as context
            word = sentence[tokenIdx]
            wordLower = word.lower()

            if wordLower in wordCasingLookup and len(wordCasingLookup[wordLower]) >= 2:  # Only if there are multiple casing options
                prevWord = sentence[tokenIdx - 1]
                backwardBiDist[prevWord + "_" + word] += 1

                if tokenIdx < len(sentence) - 1:
                    nextWord = sentence[tokenIdx + 1].lower()
                    forwardBiDist[word + "_" + nextWord] += 1

    # :: Create trigram lookup ::
    for sentence in text:
        if not checkSentenceSanity(sentence):
            continue

        for tokenIdx in range(2, len(sentence) - 1):  # Start at 2 so the sentence-initial token is never used as context
            prevWord = sentence[tokenIdx - 1]
            curWord = sentence[tokenIdx]
            curWordLower = curWord.lower()
            nextWordLower = sentence[tokenIdx + 1].lower()

            if curWordLower in wordCasingLookup and len(wordCasingLookup[curWordLower]) >= 2:  # Only if there are multiple casing options
                trigramDist[prevWord + "_" + curWord + "_" + nextWordLower] += 1
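# Usage sketch (illustrative, not part of the original file): the caller owns
# the lookup structures and passes them in to be filled. With NLTK's Brown
# corpus (available after nltk.download('brown')) this could look like:
#
#   from nltk.corpus import brown
#   wordCasingLookup = {}
#   uniDist = nltk.FreqDist()
#   backwardBiDist = nltk.FreqDist()
#   forwardBiDist = nltk.FreqDist()
#   trigramDist = nltk.FreqDist()
#   updateDistributionsFromSentences(brown.sents(), wordCasingLookup, uniDist,
#                                    backwardBiDist, forwardBiDist, trigramDist)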
def updateDistributionsFromNgrams(bigramFile, trigramFile, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist):
    """
    Updates the frequency distributions based on n-gram files,
    e.g. the n-gram files from http://www.ngrams.info/download_coca.asp
    """
    # :: Bigram file: one tab-separated "count word1 word2" entry per line ::
    with open(bigramFile) as fIn:
        for line in fIn:
            splits = line.strip().split('\t')
            cnt, word1, word2 = splits
            cnt = int(cnt)

            # Unigrams
            if word1.lower() not in wordCasingLookup:
                wordCasingLookup[word1.lower()] = set()
            wordCasingLookup[word1.lower()].add(word1)

            if word2.lower() not in wordCasingLookup:
                wordCasingLookup[word2.lower()] = set()
            wordCasingLookup[word2.lower()].add(word2)

            uniDist[word1] += cnt
            uniDist[word2] += cnt

            # Bigrams
            backwardBiDist[word1 + "_" + word2] += cnt
            forwardBiDist[word1 + "_" + word2.lower()] += cnt

    # :: Trigram file: one tab-separated "count word1 word2 word3" entry per line ::
    with open(trigramFile) as fIn:
        for line in fIn:
            splits = line.strip().split('\t')
            cnt, word1, word2, word3 = splits
            cnt = int(cnt)
            trigramDist[word1 + "_" + word2 + "_" + word3.lower()] += cnt
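

if __name__ == '__main__':
    # Minimal demo (added as a sketch, not part of the original file):
    # 'bigrams.txt' and 'trigrams.txt' are hypothetical names for
    # tab-separated "count\tword1\tword2[\tword3]" files such as the
    # COCA n-gram downloads referenced above.
    wordCasingLookup = {}
    uniDist = nltk.FreqDist()
    backwardBiDist = nltk.FreqDist()
    forwardBiDist = nltk.FreqDist()
    trigramDist = nltk.FreqDist()

    updateDistributionsFromNgrams('bigrams.txt', 'trigrams.txt', wordCasingLookup,
                                  uniDist, backwardBiDist, forwardBiDist, trigramDist)

    print("Distinct lowercased words:", len(wordCasingLookup))
    print("Most frequent unigrams:", uniDist.most_common(5))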