import sys
import nltk
import numpy as np
from collections import Counter
from nltk.corpus import stopwords

sw = stopwords.words('english')

def remove_stopwords(text, sw):
    # Drop any token that appears in the stopword list.
    return [w for w in text if w not in sw]
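For example (an illustrative call, not part of the original listing):

# remove_stopwords(['the', 'cat', 'sat', 'on', 'the', 'mat'], sw)
# -> ['cat', 'sat', 'mat']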
Pass the corpus as a string, which is subsequently broken into tokenised sentences.

def tokenize_corpus(corpus, proc_mode=0):
    token_sents = nltk.sent_tokenize(corpus)  # split the corpus into sentences
    token_words = []  # individual word tokens
    tags = []         # (word, POS-tag) pairs
    for s in token_sents:
        tk = nltk.word_tokenize(s)
        token_words.extend(tk)
        tags.extend(nltk.pos_tag(tk))
    # Optionally stem ('s') or lemmatise ('l') the tokens before re-tagging.
    if proc_mode == 's':
        stemmer = nltk.SnowballStemmer('english', ignore_stopwords=True)
        token_words = [stemmer.stem(t) for t in token_words]
    elif proc_mode == 'l':
        wnl = nltk.WordNetLemmatizer()
        token_words = [wnl.lemmatize(t) for t in token_words]

    tags = nltk.pos_tag(token_words)
    # 't' is assumed to be a POS-tag matching helper imported in lines omitted
    # from this excerpt; it tests whether a tag denotes a noun or a verb.
    nouns = [i[0] for i in tags if t.matchables(t.Noun, i[1])]
    verbs = [i[0] for i in tags if t.matchables(t.Verb, i[1])]

    count_nouns = Counter(nouns)
    count_verbs = Counter(verbs)
    return {'verbs': count_verbs, 'nouns': count_nouns,
            'tk_sent': token_sents, 'tk_word': token_words}
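As a usage sketch (illustrative only: the sentence, the indicated outputs, the availability of the NLTK tokeniser and tagger data, and the tag helper t are assumptions rather than part of the original listing):

# res = tokenize_corpus("John likes Mary. Mary likes music.")
# res['tk_sent']  ->  ['John likes Mary.', 'Mary likes music.']
# res['nouns']    ->  e.g. Counter({'Mary': 2, 'John': 1, 'music': 1})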
if __name__ == "__main__":
    # Read the corpus from the file named on the command line.
    with open(sys.argv[1], 'r') as corpusFile:
        corpus_text = corpusFile.read()

    words = tokenize_corpus(corpus_text)  # assumed call; it falls in lines omitted from this excerpt
    # The noun-verb-noun meaning space pairs every ordered noun pair with every verb.
    nvn_space_size = len(words['nouns'])**2 * len(words['verbs'])

    print("Nouns count: ", words['nouns'])
    print("Verbs count: ", words['verbs'])
    print("S Vec meaning space size: ", nvn_space_size)
    print("Required qubits: ", int(np.ceil(np.log2(nvn_space_size))))
    common_n = words['nouns'].most_common()
    for s in words['tk_sent']:
        tokens = nltk.word_tokenize(s)
        tags = nltk.pos_tag(tokens)
        nouns = [i[0] for i in tags if t.matchables(t.Noun, i[1])]
        verbs = [i[0] for i in tags if t.matchables(t.Verb, i[1])]
        # For each ordered pair of distinct common nouns, record where each noun
        # occurs in the sentence. count_nouns is assumed to be bound to
        # words['nouns'] in lines omitted from this excerpt.
        for i, v_i in enumerate(common_n[:len(count_nouns)-1]):
            for j, v_j in enumerate(common_n[i+1:len(count_nouns)]):
                idx_i = np.array([pos for pos, val in enumerate(tokens) if val == v_i[0]])
                idx_j = np.array([pos for pos, val in enumerate(tokens) if val == v_j[0]])
                # ... lines omitted from this excerpt; they define ii, jj and the
                # separation val used below.
                tmp = int(np.abs(jj - ii))
                # ... further lines omitted from this excerpt.
                if (i != j) and (v_i != v_j) and (val < 2):
                    print(nouns[i], nouns[j], i, j, val)
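The omitted lines between building idx_i/idx_j and the final test evidently produce ii, jj and the separation val; given tmp = int(np.abs(jj - ii)) and the val < 2 check, one plausible reading is that they find the smallest token distance between any occurrence of the two nouns. A minimal sketch of that reading (the helper name min_token_distance is hypothetical, this is not claimed to be the original code, and it assumes numpy is imported as np, as above):

def min_token_distance(idx_i, idx_j):
    # Smallest absolute token separation between any occurrence of noun i and
    # any occurrence of noun j.
    val = np.inf
    for ii in idx_i:
        for jj in idx_j:
            tmp = int(np.abs(jj - ii))
            if tmp < val:
                val = tmp
    return val

# e.g. min_token_distance(np.array([2, 9]), np.array([4])) == 2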