step2.py
# Use spaCy to extract specialized vocabulary (noun chunks) from the corpus.
import spacy
from spacy import displacy  # used only by the commented-out displacy.serve call below
def pre_process_en(corpora=""):
    """
    Preprocess a corpus of English documents. The intended steps are:
    1. tokenization
    2. stop-word removal
    3. punctuation removal
    4. stemming/lemmatization
    5. low-frequency word removal
    (This function only demonstrates spaCy's per-token analysis on a sample
    text; a sketch of the five steps above follows the function.)
    :param corpora: input documents, [str, str, ...]
    :return: processed text, [[], [], ...]
    """
    # Text: the original word text.
    # Lemma: the base form of the word.
    # POS: the simple part-of-speech tag.
    # Tag: the detailed part-of-speech tag.
    # Dep: syntactic dependency, i.e. the relation between tokens.
    # Shape: the word shape – capitalisation, punctuation, digits.
    # is_alpha: is the token an alpha character?
    # is_stop: is the token part of a stop list, i.e. the most common words of the language?
    nlp = spacy.load('en_core_web_sm')
    text = ("The present invention relates to the use of gallium beam lithography "
            "to form superconductive structures. Generally, the method includes "
            "exposing a surface to gallium to form an implanted region and then "
            "removing material adjacent to and/or below that implanted region. "
            "In particular embodiments, the methods herein provide microstructures "
            "and nanostructures in any useful substrate, such as those including "
            "niobium, tantalum, tungsten, or titanium.")
    doc = nlp(text)
    # Per-token analysis.
    print("doc")
    for token in doc:
        print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
              token.shape_, token.is_alpha, token.is_stop)
    # Named-entity extraction.
    print("doc.ents")
    for ent in doc.ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)
    # Noun phrases.
    print("doc.noun_chunks")
    for np in doc.noun_chunks:
        print(np.text)
        # print(np.text, np.root.dep_, np.root.head.text)
    # displacy.serve(doc, style='ent')

# pre_process_en()
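
# A minimal sketch (not in the original script) of the five preprocessing
# steps named in the docstring, which pre_process_en itself does not
# implement. Assumptions: the helper name preprocess_en and the min_freq
# threshold are illustrative, and spaCy's lemmatization stands in for the
# stemming of step 4.
from collections import Counter

def preprocess_en(corpora, min_freq=2):
    """Return one list of lemmas per document, with rare lemmas dropped."""
    nlp_en = spacy.load('en_core_web_sm')
    # Steps 1-4: tokenize; keep alphabetic, non-stop tokens; lemmatize.
    docs = [[token.lemma_.lower()
             for token in nlp_en(text)
             if token.is_alpha and not token.is_stop]
            for text in corpora]
    # Step 5: drop lemmas that occur fewer than min_freq times corpus-wide.
    freq = Counter(lemma for doc in docs for lemma in doc)
    return [[lemma for lemma in doc if freq[lemma] >= min_freq]
            for doc in docs]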
nlp = spacy.load('en_core_web_sm')
with open(r"Data\RawData\corpus_en.txt", 'r', encoding="UTF8") as en_Reader, \
        open(r"Data\Other\word_en.txt", 'w', encoding="UTF8") as Writer:
    texts_en = [line.strip() for line in en_Reader.readlines()]
    for i, document in enumerate(texts_en, start=1):
        print(i)
        Writer.write("Document {0}:\n".format(i))
        doc = nlp(document)
        for np in doc.noun_chunks:
            print(np.text)
            Writer.write(np.text + "\n")
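
# Optional batched variant (a sketch, not in the original script): nlp.pipe
# streams texts through the pipeline in batches and is usually faster than
# calling nlp() once per document. The function name and out_path are
# illustrative assumptions.
def write_noun_chunks(texts, out_path):
    with open(out_path, 'w', encoding="UTF8") as writer:
        for i, doc in enumerate(nlp.pipe(texts), start=1):
            writer.write("Document {0}:\n".format(i))
            for np in doc.noun_chunks:
                writer.write(np.text + "\n")

# Example: write_noun_chunks(texts_en, r"Data\Other\word_en_pipe.txt")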