syntactic_analysis.py
"""Syntactic analysis helpers built on spaCy: sentence lengths, vocabulary
size, part-of-speech tag counts, all-caps words, and word frequencies."""

import spacy
from collections import Counter

# Load the small English pipeline once at module import.
nlp = spacy.load("en_core_web_sm")
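# Note: spacy.load raises OSError if the model is not installed; it can be
# fetched with:
#     python -m spacy download en_core_web_sm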

def analyze_sentence_length(text):
    """Segment the text into sentences and return each sentence's length in tokens."""
    doc = nlp(text)
    sentence_lengths = [len(sent) for sent in doc.sents]
    return sentence_lengths

def analyze_words(text):
    """Tokenize the text and return the vocabulary size: the number of distinct
    lowercased lemmas, excluding stop words, punctuation, and whitespace."""
    doc = nlp(text)
    tokens = [
        token.lemma_.lower()
        for token in doc
        if not token.is_stop and not token.is_punct and not token.is_space
    ]
    vocabulary_size = len(set(tokens))
    return vocabulary_size

def analyze_syntax(text):
    """POS-tag the text and return a Counter mapping each coarse part-of-speech
    tag (token.pos_) to its frequency, excluding stop words, punctuation, and
    whitespace."""
    doc = nlp(text)
    tag_counts = Counter(
        token.pos_
        for token in doc
        if not token.is_stop and not token.is_punct and not token.is_space
    )
    return tag_counts

def analyze_caps(text):
    """Return the tokens written entirely in uppercase (e.g. acronyms); note
    that str.isupper() matches all-caps tokens, not merely capitalized words."""
    doc = nlp(text)
    capitalized_words = [
        token.text
        for token in doc
        if token.text.isupper() and not token.is_stop and not token.is_punct and not token.is_space
    ]
    return capitalized_words

def word_freqs(text):
    """Count the frequency of each lowercased lemma, excluding stop words,
    punctuation, and whitespace, and return the counts as a plain dict."""
    doc = nlp(text)
    words = [
        token.lemma_.lower()
        for token in doc
        if not token.is_stop and not token.is_punct and not token.is_space
    ]
    word_frequencies = dict(Counter(words))
    return word_frequencies

def analyze(text):
    """Run every analysis on the text and collect the results in one dict.

    Each helper re-parses the text with nlp(); for long texts it would be
    cheaper to parse once and share the Doc.
    """
    return {
        "sentence_lengths": analyze_sentence_length(text),
        "vocabulary_size": analyze_words(text),
        "pos_tag_counts": analyze_syntax(text),
        "capitalized_words": analyze_caps(text),
        "word_frequencies": word_freqs(text),
    }
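
# Minimal usage sketch (not part of the original file; the sample text is
# hypothetical): run the full pipeline on a short string and pretty-print
# the resulting dictionary.
if __name__ == "__main__":
    from pprint import pprint

    sample = "NASA launched a new telescope. The telescope studies distant galaxies."
    pprint(analyze(sample))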