QNLP  v1.0
tag_file.py

import tags as t  # Project-local tagging helpers: matchables(), Noun, Verb (assumed import; the module is referenced as 't' below)
import nltk
import sys
import numpy as np

from collections import Counter
from nltk.corpus import stopwords

sw = stopwords.words('english')
def remove_stopwords(text, sw):
    """Return the tokens from `text` that are not in the stopword list `sw`."""
    return [w for w in text if w not in sw]

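# For example (sw is NLTK's English stopword list loaded above):
#   remove_stopwords(['the', 'cat', 'sat'], sw)  ->  ['cat', 'sat']
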
'''
Pass the corpus as a single string; it is split into tokenised sentences and
words, with stopwords removed. Set proc_mode='s' to stem the tokens, or
proc_mode='l' to lemmatise them (default 0: no further processing).
'''
def tokenize_corpus(corpus, proc_mode=0):
    token_sents = nltk.sent_tokenize(corpus)  # Split on sentences
    token_words = []  # Individual words
    tags = []  # Words and respective tags

    for s in token_sents:
        tk = nltk.word_tokenize(s)
        tk = remove_stopwords(tk, sw)
        token_words.extend(tk)
        tags.extend(nltk.pos_tag(tk))

    if proc_mode != 0:
        if proc_mode == 's':
            stemmer = nltk.SnowballStemmer('english', ignore_stopwords=True)
            token_words = [stemmer.stem(w) for w in token_words]
        elif proc_mode == 'l':
            wnl = nltk.WordNetLemmatizer()
            token_words = [wnl.lemmatize(w) for w in token_words]

        tags = nltk.pos_tag(token_words)  # Re-tag the processed tokens

    nouns = [i[0] for i in tags if t.matchables(t.Noun, i[1])]
    verbs = [i[0] for i in tags if t.matchables(t.Verb, i[1])]

    count_nouns = Counter(nouns)
    count_verbs = Counter(verbs)
    return {'verbs': count_verbs, 'nouns': count_nouns, 'tk_sent': token_sents, 'tk_word': token_words}

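# A minimal sketch of calling tokenize_corpus directly, on a hypothetical
# two-sentence corpus (exact counts depend on the NLTK tagger and stopword list):
#   words = tokenize_corpus("Alice sees Bob. Bob greets Alice.")
#   words['nouns']  # e.g. Counter({'Alice': 2, 'Bob': 2})
#   words['verbs']  # e.g. Counter({'sees': 1, 'greets': 1})
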
if __name__ == "__main__":
    corpus_text = ""

    with open(sys.argv[1], 'r') as corpusFile:
        corpus_text = corpusFile.read()

    words = tokenize_corpus(corpus_text, proc_mode=0)

    # Meaning-space size for noun-verb-noun sentence structures: |nouns|^2 * |verbs|
    nvn_space_size = len(words['nouns'])**2 * len(words['verbs'])
    print("Nouns count: ", words['nouns'])
    print("Verbs count: ", words['verbs'])

    print("S Vec meaning space size: ", nvn_space_size)
    print("Required qubits: ", int(np.ceil(np.log2(nvn_space_size))))
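    # Worked example with hypothetical counts: 4 distinct nouns and 3 distinct
    # verbs give a noun-verb-noun space of 4**2 * 3 = 48 basis states, so
    # ceil(log2(48)) = 6 qubits are required.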

    common_n = words['nouns'].most_common()

    for s in words['tk_sent']:
        tokens = nltk.word_tokenize(s)
        tokens = remove_stopwords(tokens, sw)
        tags = nltk.pos_tag(tokens)

        nouns = [i[0] for i in tags if t.matchables(t.Noun, i[1])]
        verbs = [i[0] for i in tags if t.matchables(t.Verb, i[1])]

        # For each pair of distinct corpus-wide nouns, find the smallest token
        # distance between their occurrences in this sentence, and report pairs
        # that fall within a window of two tokens.
        for i, v_i in enumerate(common_n[:-1]):
            for j, v_j in enumerate(common_n[i+1:], start=i+1):

                idx_i = np.array([pos for pos, val in enumerate(tokens) if val == v_i[0]])
                idx_j = np.array([pos for pos, val in enumerate(tokens) if val == v_j[0]])
                val = np.inf

                for ii in idx_i:
                    for jj in idx_j:
                        tmp = int(np.abs(jj - ii))
                        if val > tmp:
                            val = tmp

                if (v_i != v_j) and (val < 2):
                    print(v_i[0], v_j[0], i, j, val)
        print(common_n)
        print(tokens)
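
# Usage sketch: run against a plain-text corpus file ('corpus.txt' is a
# hypothetical name). Assumes the NLTK data packages punkt,
# averaged_perceptron_tagger, stopwords and wordnet are installed.
#   python tag_file.py corpus.txt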