QNLP  v1.0
QNLP.proc.process_corpus Namespace Reference

Functions

def remove_stopwords (text, sw)
 
def tokenize_corpus (corpus, proc_mode=0, stop_words=True)
 
def load_corpus (corpus_path, proc_mode=0)
 
def process (corpus_path, proc_mode=0)
 
def word_pairing (words, window_size)
 
def num_qubits (words)
 
def mapNameToBinaryBasis (words, db_name, table_name="qnlp")
 
def run (BasisPath, CorpusPath, proc_mode=0, db_name="qnlp_tagged_corpus")
 

Variables

 sw
 

Function Documentation

◆ load_corpus()

def QNLP.proc.process_corpus.load_corpus (   corpus_path,
  proc_mode = 0 
)
Load the corpus from disk.

Definition at line 65 of file process_corpus.py.

def load_corpus(corpus_path, proc_mode=0):
    """Load the corpus from disk and return its full text as one string.

    Keyword arguments:
    corpus_path -- path of the corpus file on disk
    proc_mode   -- unused here; kept for interface symmetry with process()
    """
    with open(corpus_path, 'r') as corpus_file:
        return corpus_file.read()
def load_corpus(corpus_path, proc_mode=0)

Referenced by QNLP.proc.process_corpus.process().

Here is the caller graph for this function:

◆ mapNameToBinaryBasis()

def QNLP.proc.process_corpus.mapNameToBinaryBasis (   words,
  db_name,
  table_name = "qnlp" 
)
Maps the unique string in each respective space to a binary basis number for qubit representation.

Keyword arguments:
words       -- list of the tokenized words
db_name     -- name of the database file
table_name  -- name of the table to store data in db_name

Definition at line 135 of file process_corpus.py.

def mapNameToBinaryBasis(words, db_name, table_name="qnlp"):
    """
    Maps the unique string in each respective space to a binary basis number
    for qubit representation.

    Keyword arguments:
    words -- list of the tokenized words
    db_name -- name of the database file
    table_name -- name of the table to store data in db_name
    """
    _, num_q_total, num_q_n, num_q_v = num_qubits(words)

    def _two_way_map(tokens, width):
        # Bidirectional mapping: token -> zero-padded binary string, and the
        # binary string back to the token, in one dict.
        mapping = {}
        for idx, token in enumerate(tokens):
            bin_str = format(idx, 'b').zfill(width)
            mapping[token] = bin_str
            mapping[bin_str] = token
        return mapping

    nounMap = _two_way_map(words['nouns'], num_q_n)
    verbMap = _two_way_map(words['verbs'], num_q_v)

    # Persist both mappings so later stages can translate between tokens
    # and their qubit basis states.
    db = qdb.qnlp_db(db_name, ".")
    db.db_insert(nounMap, "noun", table_name)
    db.db_insert(verbMap, "verb", table_name)

    return (nounMap, verbMap)
def mapNameToBinaryBasis(words, db_name, table_name="qnlp")
int num_qubits
Definition: simple_MPI.py:10

References QNLP.proc.process_corpus.num_qubits().

Referenced by QNLP.proc.process_corpus.run().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ num_qubits()

def QNLP.proc.process_corpus.num_qubits (   words)
Calculate the required number of qubits to encode the NVN state-space of the corpus.
Returns tuple of required states and number of needed qubits. 
While it is possible to reduce qubit count by considering the total required number as being the product of each combo, it is easier to deal with separate states for nouns and verbs, with at most 1 extra qubit required.

Keyword arguments:
words       -- list of the tokenized words

Definition at line 116 of file process_corpus.py.

def num_qubits(words):
    """
    Calculate the number of qubits required to encode the noun-verb-noun
    state-space of the corpus.

    Nouns and verbs get separate registers: while encoding the combined
    product space could use marginally fewer qubits, separate registers cost
    at most one extra qubit and keep each mapping simpler.

    Keyword arguments:
    words -- dict of tokenized words with 'nouns' and 'verbs' entries

    Returns:
    (unique NVN states, total qubits, qubits per noun register, verb qubits)
    """
    noun_count = len(words['nouns'])
    verb_count = len(words['verbs'])

    nvn_space_size = noun_count**2 * verb_count
    req_qubits_n = int(np.ceil(np.log2(noun_count)))
    req_qubits_v = int(np.ceil(np.log2(verb_count)))

    # Two noun registers (subject and object) plus one verb register.
    req_qubits = 2*req_qubits_n + req_qubits_v

    print("Unique states:",nvn_space_size,"\tRequired qubits total:", req_qubits, "\tRequired qubits nouns:", req_qubits_n, "\tRequired qubits verbs:", req_qubits_v)
    return (nvn_space_size, req_qubits, req_qubits_n, req_qubits_v)
int num_qubits
Definition: simple_MPI.py:10

Referenced by QNLP.proc.process_corpus.mapNameToBinaryBasis(), and QNLP.proc.process_corpus.run().

Here is the caller graph for this function:

◆ process()

def QNLP.proc.process_corpus.process (   corpus_path,
  proc_mode = 0 
)
Tokenize the corpus data.

Definition at line 72 of file process_corpus.py.

def process(corpus_path, proc_mode=0):
    """Read the corpus at corpus_path and tokenize it.

    Thin convenience wrapper: loads the raw text from disk and forwards it
    to tokenize_corpus with the requested processing mode.
    """
    raw_text = load_corpus(corpus_path)
    return tokenize_corpus(raw_text, proc_mode=proc_mode)
def tokenize_corpus(corpus, proc_mode=0, stop_words=True)
def process(corpus_path, proc_mode=0)
def load_corpus(corpus_path, proc_mode=0)

References QNLP.proc.process_corpus.load_corpus(), and QNLP.proc.process_corpus.tokenize_corpus().

Here is the call graph for this function:

◆ remove_stopwords()

def QNLP.proc.process_corpus.remove_stopwords (   text,
  sw 
)
Remove words that do not add to the meaning; simplifies sentences

Definition at line 19 of file process_corpus.py.

def remove_stopwords(text, sw):
    """Return the tokens of text with every stop word in sw dropped."""
    kept = []
    for token in text:
        if token not in sw:
            kept.append(token)
    return kept
def remove_stopwords(text, sw)

Referenced by QNLP.proc.process_corpus.tokenize_corpus(), and QNLP.proc.VectorSpaceModel.VSM_pc.tokenize_corpus().

Here is the caller graph for this function:

◆ run()

def QNLP.proc.process_corpus.run (   BasisPath,
  CorpusPath,
  proc_mode = 0,
  db_name = "qnlp_tagged_corpus" 
)

Definition at line 167 of file process_corpus.py.

167 def run(BasisPath, CorpusPath, proc_mode=0, db_name="qnlp_tagged_corpus"):
# Driver: reads the basis and corpus files, tokenizes both with the same
# proc_mode, examines verb/noun proximity in the basis words, and maps both
# token sets to binary basis states stored in the database.
168  # Load the basis words
169  basis_text=""
170  # Load the corpus
171  corpus_text=""
172 
173  with open(BasisPath, 'r') as basisFile:
174  basis_text=basisFile.read()
175 
176  with open(CorpusPath, 'r') as corpusFile:
177  corpus_text=corpusFile.read()
178 
# Tokenize both texts with the same processing mode so their noun/verb
# spaces are derived consistently.
179  basis_words = tokenize_corpus(basis_text, proc_mode=proc_mode)
180  corpus_words = tokenize_corpus(corpus_text, proc_mode=proc_mode)
181 
182  #Naively set sentence boundary at location of full-stops.
183  sentence_boundary = set([i for i, x in enumerate(corpus_words['tk_words']) if "." in x])
184 
185  # examine word windowing for proximity
186  #window = (2,3)
187  window=1
188  word_pairing(basis_words, window)
189 
190  # Determine qubit requirements
191  _, num_q_total, num_q_n, num_q_v = num_qubits(basis_words)
192 
193  #Map basis words to binary strings, and populate DB
194  basis_nMap, basis_vMap = mapNameToBinaryBasis(basis_words, db_name, "basis")
195  corpus_nMap, corpus_vMap = mapNameToBinaryBasis(corpus_words, db_name, "corpus")
# NOTE(review): this generated listing ends at line 195 with no visible
# return; the function may continue in process_corpus.py — confirm against
# the source before relying on this excerpt.
def word_pairing(words, window_size)
def mapNameToBinaryBasis(words, db_name, table_name="qnlp")
def tokenize_corpus(corpus, proc_mode=0, stop_words=True)
int num_qubits
Definition: simple_MPI.py:10
def run(BasisPath, CorpusPath, proc_mode=0, db_name="qnlp_tagged_corpus")

References QNLP.proc.process_corpus.mapNameToBinaryBasis(), QNLP.proc.process_corpus.num_qubits(), QNLP.proc.process_corpus.tokenize_corpus(), and QNLP.proc.process_corpus.word_pairing().

Here is the call graph for this function:

◆ tokenize_corpus()

def QNLP.proc.process_corpus.tokenize_corpus (   corpus,
  proc_mode = 0,
  stop_words = True 
)
Pass the corpus as a string, which is subsequently broken into tokenized sentences, and returned as dictionary of verbs, nouns, tokenized words, and tokenized sentences.

Keyword arguments:
corpus      -- string representing the corpus to tokenize
proc_mode   -- defines the processing mode. Lemmatization: proc_mode="l"; Stemming: proc_mode="s"; No additional processing: proc_mode=0 (default=0)
stop_words  -- indicates whether stop words should be removed (False) or kept (True) (from nltk.corpus.stopwords.words("english"))

Definition at line 25 of file process_corpus.py.

25 def tokenize_corpus(corpus, proc_mode=0, stop_words=True):
26  """
27  Pass the corpus as a string, which is subsequently broken into tokenized sentences, and returned as dictionary of verbs, nouns, tokenized words, and tokenized sentences.
28 
29  Keyword arguments:
30  corpus -- string representing the corpus to tokenize
31  proc_mode -- defines the processing mode. Lemmatization: proc_mode=\"l\"; Stemming: proc_mode=\"s\"; No additional processing: proc_mode=0 (default=0)
32  stop_words -- indicates whether stop words should be removed (False) or kept (True) (from nltk.corpus.stopwords.words(\"english\"))
33  """
34 
35  token_sents = nltk.sent_tokenize(corpus) #Split on sentences
36  token_words = [] # Individual words
37  tags = [] # Words and respective tags
38 
# Tokenize each sentence into words; when stop_words is False the
# module-level stop-word list `sw` is used to strip them first.
39  for s in token_sents:
40  tk = nltk.word_tokenize(s)
41  if stop_words == False:
42  tk = remove_stopwords(tk, sw)
43  token_words.extend(tk)
44  tags.extend(nltk.pos_tag(tk))
45 
# Optional normalisation pass: 's' stems with Snowball, 'l' lemmatizes
# with WordNet.
46  if proc_mode != 0:
47  if proc_mode == 's':
48  s = nltk.SnowballStemmer('english', ignore_stopwords = not stop_words)
49  token_words = [s.stem(t) for t in token_words]
50  elif proc_mode == 'l':
51  wnl = nltk.WordNetLemmatizer()
52  token_words = [wnl.lemmatize(t) for t in token_words]
53 
# NOTE(review): indentation is flattened in this generated listing, so it is
# unclear whether the re-tagging below runs unconditionally or only when
# proc_mode != 0 — confirm against process_corpus.py line 54.
54  tags = nltk.pos_tag(token_words)
# Partition tagged tokens into nouns and verbs via the project's tag matcher.
55  nouns = [i[0] for i in tags if tg.matchables(tg.Noun, i[1])]
56  verbs = [i[0] for i in tags if tg.matchables(tg.Verb, i[1])]
57 
# Frequency counts of each unique noun/verb.
58  count_nouns = Counter(nouns)
59  count_verbs = Counter(verbs)
60 
61  return {'verbs':count_verbs, 'nouns':count_nouns, 'tk_sentence':token_sents, 'tk_words':token_words}
62 
def remove_stopwords(text, sw)
def tokenize_corpus(corpus, proc_mode=0, stop_words=True)

References QNLP.proc.process_corpus.remove_stopwords().

Referenced by QNLP.proc.VectorSpaceModel.VectorSpaceModel.load_tokens(), QNLP.proc.process_corpus.process(), and QNLP.proc.process_corpus.run().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ word_pairing()

def QNLP.proc.process_corpus.word_pairing (   words,
  window_size 
)
Examine the words around each verb with the specified window size, and attempt to match the NVN pattern. The window_size specifies the number of values around each verb to search for the matching nouns. If passed as tuple
(l,r) gives the left and right windows separately. If passed as a scalar, both values are equal.

Keyword arguments:
words       -- list of the tokenized words
window_size -- window to search for word pairings. Tuple

Definition at line 79 of file process_corpus.py.

def word_pairing(words, window_size):
    """
    Examine the words around each verb with the specified window size and try
    to match the noun-verb-noun (NVN) pattern.  window_size is the number of
    tokens searched around each verb for matching nouns; a tuple (l, r) sets
    the left and right windows independently, while a scalar uses the same
    value on both sides.

    Keyword arguments:
    words       -- list of the tokenized words
    window_size -- window to search for word pairings. Tuple
    """
    tokens = words['tk_words']

    # Naively set sentence boundary at location of full-stops.
    boundaries = {i for i, tok in enumerate(tokens) if "." in tok}

    if type(window_size) is tuple:
        w_left, w_right = window_size
    else:
        w_left = w_right = window_size

    # Token positions at which each verb occurs (substring match).
    verb_positions = {v: [i for i, tok in enumerate(tokens) if v in tok]
                      for v in words['verbs']}

    for noun in words['nouns']:
        for verb, positions in verb_positions.items():
            for pos in positions:
                # Ensure that no full stop appears over the windowed region
                # to avoid crossing sentences.
                crosses_left = boundaries & set(range(pos - w_left, pos))
                crosses_right = boundaries & set(range(pos, pos + w_right))

                if not crosses_left and noun in tokens[pos - w_left:pos]:
                    print("LEFT:=", noun, verb)
                if not crosses_right and (pos + w_right) < len(tokens) and noun in tokens[pos:pos + w_right]:
                    print("RIGHT:=", verb, noun)
def word_pairing(words, window_size)

Referenced by QNLP.proc.process_corpus.run().

Here is the caller graph for this function:

Variable Documentation

◆ sw

QNLP.proc.process_corpus.sw

Definition at line 15 of file process_corpus.py.