QNLP v1.0
process_corpus.py
# Convert corpus to non-zero state coefficients for quantum encoding

import QNLP.tagging as tg
import os

import nltk
import sys
import numpy as np

# DB for exporting data to be read into C++
import QNLP.io.qnlp_db as qdb

from collections import Counter
from nltk.corpus import stopwords

sw = stopwords.words('english')


def remove_stopwords(text, sw):
    """Remove words that do not add to the meaning; simplifies sentences."""
    return [w for w in text if w not in sw]
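
# Usage sketch (illustrative tokens, using the module-level stopword list sw
# defined above):
#
#   remove_stopwords(['the', 'cat', 'sat'], sw)   # -> ['cat', 'sat']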


def tokenize_corpus(corpus, proc_mode=0, stop_words=True):
    """
    Tokenize the corpus string into sentences and words, and return a
    dictionary of verb counts, noun counts, tokenized sentences, and tokenized
    words.

    Keyword arguments:
    corpus -- string representing the corpus to tokenize
    proc_mode -- processing mode: stemming (proc_mode='s'), lemmatization
                 (proc_mode='l'), or no additional processing (proc_mode=0,
                 the default)
    stop_words -- whether stop words (from nltk.corpus.stopwords.words('english'))
                  should be kept (True) or removed (False)
    """

    token_sents = nltk.sent_tokenize(corpus)  # Split on sentences
    token_words = []  # Individual words
    tags = []  # Words and their respective tags

    for s in token_sents:
        tk = nltk.word_tokenize(s)
        if not stop_words:
            tk = remove_stopwords(tk, sw)
        token_words.extend(tk)
        tags.extend(nltk.pos_tag(tk))

    if proc_mode != 0:
        if proc_mode == 's':
            stemmer = nltk.SnowballStemmer('english', ignore_stopwords=not stop_words)
            token_words = [stemmer.stem(t) for t in token_words]
        elif proc_mode == 'l':
            wnl = nltk.WordNetLemmatizer()
            token_words = [wnl.lemmatize(t) for t in token_words]

    # Re-tag the processed tokens, then separate nouns and verbs.
    tags = nltk.pos_tag(token_words)
    nouns = [i[0] for i in tags if tg.matchables(tg.Noun, i[1])]
    verbs = [i[0] for i in tags if tg.matchables(tg.Verb, i[1])]

    count_nouns = Counter(nouns)
    count_verbs = Counter(verbs)

    return {'verbs': count_verbs, 'nouns': count_nouns, 'tk_sentence': token_sents, 'tk_words': token_words}
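
# Usage sketch (the sample text is made up; exact noun/verb counts depend on
# the installed NLTK tagger models):
#
#   tokens = tokenize_corpus("John likes Mary. Mary walks home.")
#   tokens['nouns']        # Counter of noun occurrences
#   tokens['verbs']        # Counter of verb occurrences
#   tokens['tk_sentence']  # list of sentence strings
#   tokens['tk_words']     # flat list of word tokens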


def load_corpus(corpus_path, proc_mode=0):
    """Load the corpus from disk and return it as a string."""
    corpus_text = ""
    with open(corpus_path, 'r') as corpusFile:
        corpus_text = corpusFile.read()
    return corpus_text


def process(corpus_path, proc_mode=0):
    """Load and tokenize the corpus data."""
    words = tokenize_corpus(load_corpus(corpus_path), proc_mode=proc_mode)
    return words
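
# Usage sketch ('corpus.txt' is a hypothetical file path):
#
#   words = process('corpus.txt', proc_mode='l')   # lemmatized token data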


def word_pairing(words, window_size):
    """
    Examine the words around each verb within the specified window, attempting
    to match the noun-verb-noun (NVN) pattern. window_size gives the number of
    tokens around each verb to search for matching nouns; if passed as a tuple
    (l, r), the left and right windows are set separately, and if passed as a
    scalar, both are equal.

    Keyword arguments:
    words -- dictionary of tokenized data returned by tokenize_corpus
    window_size -- window to search for word pairings; scalar or tuple
    """

    # Naively set sentence boundaries at the locations of full stops.
    sentence_boundary = set(i for i, x in enumerate(words['tk_words']) if "." in x)

    if isinstance(window_size, tuple):
        window_left, window_right = window_size
    else:
        window_left = window_size
        window_right = window_size

    verb_idx = {}
    for v in words['verbs']:
        verb_idx.update({v: [i for i, x in enumerate(words['tk_words']) if v in x]})

    for n in words['nouns']:
        for v, k in verb_idx.items():
            for v_idx in k:
                # Ensure that no full stop appears in the windowed region, to
                # avoid crossing sentence boundaries.
                l_left = len(sentence_boundary.intersection(range(v_idx - window_left, v_idx)))
                l_right = len(sentence_boundary.intersection(range(v_idx, v_idx + window_right)))

                if l_left == 0 and n in words['tk_words'][v_idx - window_left:v_idx]:
                    print("LEFT:=", n, v)
                if l_right == 0 and (v_idx + window_right) < len(words['tk_words']) and n in words['tk_words'][v_idx:v_idx + window_right]:
                    print("RIGHT:=", v, n)
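
# Usage sketch (window values are illustrative; matches are printed to stdout
# rather than returned):
#
#   word_pairing(tokens, 2)        # symmetric window of 2 tokens
#   word_pairing(tokens, (2, 3))   # 2 tokens to the left, 3 to the right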


def num_qubits(words):
    """
    Calculate the number of qubits required to encode the NVN state-space of
    the corpus. Returns a tuple of the number of unique states and the qubit
    counts. While the total qubit count could be reduced by encoding the
    product of the state-space sizes directly, it is easier to keep separate
    registers for nouns and verbs, at a cost of at most one extra qubit.

    Keyword arguments:
    words -- dictionary of tokenized data returned by tokenize_corpus
    """
    nvn_space_size = len(words['nouns'])**2 * len(words['verbs'])
    req_qubits_n = int(np.ceil(np.log2(len(words['nouns']))))
    req_qubits_v = int(np.ceil(np.log2(len(words['verbs']))))

    # Direct product encoding would be:
    #req_qubits = int(np.ceil(np.log2(nvn_space_size)))

    # Two noun registers (subject and object) plus one verb register.
    req_qubits = req_qubits_n*2 + req_qubits_v

    print("Unique states:", nvn_space_size, "\tRequired qubits total:", req_qubits,
          "\tRequired qubits nouns:", req_qubits_n, "\tRequired qubits verbs:", req_qubits_v)
    return (nvn_space_size, req_qubits, req_qubits_n, req_qubits_v)
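
# Worked example (made-up counts): with 4 unique nouns and 2 unique verbs,
# nvn_space_size = 4**2 * 2 = 32 unique states, req_qubits_n = ceil(log2(4)) = 2
# and req_qubits_v = ceil(log2(2)) = 1, giving req_qubits = 2*2 + 1 = 5, the
# same as ceil(log2(32)) = 5 for the direct product encoding in this case.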


def mapNameToBinaryBasis(words, db_name, table_name="qnlp"):
    """
    Map the unique strings in each respective space to binary basis state
    labels for the qubit representation.

    Keyword arguments:
    words -- dictionary of tokenized data returned by tokenize_corpus
    db_name -- name of the database file
    table_name -- name of the table in db_name to store the data
    """

    _, num_q_total, num_q_n, num_q_v = num_qubits(words)

    verbMap = {}
    nounMap = {}
    for i, v in enumerate(words['nouns']):
        # Perform two-way mapping, from value to binary string and back.
        nounMap.update({v: format(i, 'b').zfill(num_q_n)})
        nounMap.update({format(i, 'b').zfill(num_q_n): v})

    for i, v in enumerate(words['verbs']):
        verbMap.update({v: format(i, 'b').zfill(num_q_v)})
        verbMap.update({format(i, 'b').zfill(num_q_v): v})

    db = qdb.qnlp_db(db_name, ".")
    db.db_insert(nounMap, "noun", table_name)
    db.db_insert(verbMap, "verb", table_name)

    return (nounMap, verbMap)
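
# Mapping sketch (made-up nouns; with num_q_n = 2 the first two entries map
# both ways as):
#
#   {'cat': '00', '00': 'cat', 'dog': '01', '01': 'dog', ...}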


def run(BasisPath, CorpusPath, proc_mode=0, db_name="qnlp_tagged_corpus"):
    """Load the basis and corpus files, tokenize both, and populate the DB."""
    # Load the basis words
    basis_text = ""
    # Load the corpus
    corpus_text = ""

    with open(BasisPath, 'r') as basisFile:
        basis_text = basisFile.read()

    with open(CorpusPath, 'r') as corpusFile:
        corpus_text = corpusFile.read()

    basis_words = tokenize_corpus(basis_text, proc_mode=proc_mode)
    corpus_words = tokenize_corpus(corpus_text, proc_mode=proc_mode)

    # Naively set sentence boundaries at the locations of full stops.
    # (Computed here but not used further in this function.)
    sentence_boundary = set(i for i, x in enumerate(corpus_words['tk_words']) if "." in x)

    # Examine word windowing for proximity
    #window = (2,3)
    window = 1
    word_pairing(basis_words, window)

    # Determine qubit requirements
    _, num_q_total, num_q_n, num_q_v = num_qubits(basis_words)

    # Map basis words to binary strings, and populate the DB
    basis_nMap, basis_vMap = mapNameToBinaryBasis(basis_words, db_name, "basis")
    corpus_nMap, corpus_vMap = mapNameToBinaryBasis(corpus_words, db_name, "corpus")
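
# Entry-point sketch (the argument handling here is an assumption; the real
# driver scripts may invoke run() differently):
if __name__ == "__main__":
    # Usage: python process_corpus.py <basis_file> <corpus_file>
    run(sys.argv[1], sys.argv[2], proc_mode=0)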