QNLP  v1.0
DisCoCat.py
Go to the documentation of this file.
1 
2 
3 import sqlite3
4 import os
5 from typing import Dict, Tuple
6 import QNLP.proc.process_corpus as pc
7 import numpy as np
8 from QNLP.io.qnlp_db import qnlp_db as qnlp_db
9 
10 
11 
12 #Use mixin to modify the insert statements and the structure of database
14  def create_table_discocat(self, table_name="qnlp"):
15  """
16  Create the database table for tagging the required data. The DB has columns
17  for dataset ('basis' or 'corpus'), data_type ('verb', 'noun', etc.), token (the string value), mapping_idx (the index of the mapped binary value; for superpos states, this index labels the number of values in the superposition), map_bin_id (the binary value representing the quantum state in the register), map_coeff_r/i (real and imaginary coefficients of the map_bin_id state), mapping_dir (indicates the direction of the mapping; may not be used).
18  """
19  cr_tbl = """CREATE TABLE {}(
20  id INTEGER PRIMARY KEY,
21  dataset TEXT,
22  data_type TEXT,
23  token TEXT,
24  mapping_idx INTEGER,
25  map_bin_id INTEGER,
26  map_coeff_r REAL,
27  map_coeff_i REAL,
28  mapping_dir INTEGER
29  );""".format(table_name)
30  conn = super(qdb_mixin, self).connect_db()
31  c = conn.cursor()
32 
33  try:
34  c.execute(cr_tbl)
35 
36  except sqlite3.OperationalError as oe:
37  remove_db = input("Table '{}' already exists. Remove? y/n: ".format(table_name))
38  if remove_db is "y":
39  self.drop_table(table_name)
40  self.create_table_discocat(table_name)
41 
42  except Exception as e:
43  print("SQLITE exception thrown: {0}".format(e), "Exiting program.")
44  exit()
45 
46  finally:
47  conn.commit()
48 
49  def db_insert_discocat(self, values, dataset="basis", data_type="noun", table_name="qnlp"):
50  """
51  Insert the tag to binary encoding mapping into the DB.
52 
53  values -- Dict mapping string to binary value, and binary value to string.
54  data_type -- String to indicate the type of data to be stored
55  table_name -- Name of table to store in DB
56  """
57 
58  '''
59  The DB insert operation below assumes the value field of a key in DB is a tuple,
60  containing (binary_id, weight of occurrence), where weight of occurrence cant be
61  determined by the proximity of the word to other words; essentially a count in the
62  simplest case. The mapping checks to see if the index is convertible to a numeric
63  type. If so, this will imply the reverse mapping (ie qubit result to string val),
64  and is indicated by -1. Otherwise, this will be a forward mapping, and given by 1.
65  '''
66  conn = super(qdb_mixin, self).connect_db()
67  c = conn.cursor()
68  self.create_table_discocat(table_name)
69 
70  for corpus_token, superpos in values.items():
71  l_superpos = len(superpos)
72  for idx, (distance_measure, basis_state) in enumerate(superpos):
73  c.execute("""INSERT INTO {} (
74  dataset,
75  data_type,
76  token,
77  mapping_idx,
78  map_bin_id,
79  map_coeff_r,
80  map_coeff_i,
81  mapping_dir ) VALUES(?,?,?,?,?,?,?,?)""".format(table_name),
82  (dataset,
83  data_type,
84  corpus_token,
85  idx,
86  basis_state,
87  distance_measure.real,
88  distance_measure.imag,
89  0)
90  )
91  conn.commit()
92 
93 
94 
class DisCoCat:
    """
    Implements precomputation for the DisCo(Cat) model to represent sentence meanings
    using category theory methods. See <PAPERS> for details.
    """
    def __init__(self, fd = lambda x : [1.0/(i+1) for i in x]):
        """
        Keyword arguments:
        fd -- Function applied to a collection of token-to-basis distances,
              returning the corresponding weights. Defaults to 1/(d+1) per
              distance d.
        """
        self.distance_func = fd

    def load_corpus(self, corpus_path):
        """Load a corpus from corpus_path by delegating to process_corpus.load_corpus."""
        return pc.load_corpus(corpus_path)

    def tokenise_corpus(self, corpus_text):
        """Tokenise corpus_text by delegating to process_corpus.tokenize_corpus."""
        return pc.tokenize_corpus(corpus_text)

    def word_occurrence(self, corpus_list : list):
        """
        Counts word occurrence in a given corpus, presented as a tokenised word list.
        Returns a dictionary with keys as the tokens and values as the occurrences.
        Keys appear in order of first occurrence.
        """
        word_dict = {}
        for word in corpus_list:
            word_dict[word] = word_dict.get(word, 0) + 1
        return word_dict

    def define_basis_words(self, word_dict : dict, max_words : int):
        """
        Chooses the max_words number of most common words from word_dict
        and returns them as a list of (word, count) tuples for use as basis.

        Words with equal counts keep their word_dict order. If max_words
        exceeds the number of available words, all words are returned; a
        non-positive max_words yields an empty list.
        """
        # sorted() is stable even with reverse=True, so ties stay in
        # insertion order, matching a repeated "take the first maximum" scan.
        ranked = sorted(word_dict.items(), key=lambda item: item[1], reverse=True)
        return ranked[:max(max_words, 0)]

    def map_to_basis(self, corpus_list : dict, basis : list, basis_dist_cutoff=10, distance_func=None):
        """
        Maps the words from the corpus into the chosen basis.
        Returns word_map dictionary, mapping corpus tokens -> basis states

        Each token maps to a dict {basis_token: minimum distance}, holding only
        basis tokens within basis_dist_cutoff. A token that is itself a basis
        element maps to {token: 0} only. A token with no basis element within
        the cut-off maps to None.

        Keyword arguments:
        corpus_list       -- Dict of token -> indexable record whose element [1]
                             is a numpy array of the token's positions
                             (element [0] is not read here)
        basis             -- List of basis tokens; each must be a key of corpus_list
        basis_dist_cutoff -- Cut-off for token distance from basis for it to be significant
        distance_func     -- Accepted for interface compatibility; not applied
                             here. Scaling is applied later by
                             generate_state_mapping using self.distance_func.
        """
        word_map = {}

        # map distance between basis words and other words in token list
        for word, locations in corpus_list.items():
            word_map[word] = None
            for b_val in basis:
                # Basis elements are orthogonal
                if b_val == word:
                    word_map[word] = {b_val : 0}
                    break
                # to add left-right ordering here, remove the abs and use sign of distance to indicate where words appear relative to one another.
                min_dist = np.min(np.abs(locations[1][:, np.newaxis] - corpus_list[b_val][1]))

                if min_dist <= basis_dist_cutoff:
                    if word_map[word] is None:
                        word_map[word] = {}
                    word_map[word][b_val] = min_dist
        return word_map

    def nvn_distances(self, corpus_list_n : dict, corpus_list_v : dict, dist_cutoff=2, distance_func=None):
        """
        Matches the NVN sentence structure by locating nouns and verbs that
        occur within dist_cutoff positions of one another, following the same
        procedure as used to map corpus words onto the basis.

        Every matching noun/verb pair is reported on stdout. The returned
        dictionary is currently always empty: recording the verb/noun
        relationships (and their left/right ordering) is not implemented yet.

        Keyword arguments:
        corpus_list_n -- Dict of noun -> indexable record whose element [1] is
                         a numpy array of positions
        corpus_list_v -- Dict of verb -> record of the same layout
        dist_cutoff   -- Maximum absolute positional distance for a match
        distance_func -- Accepted for interface compatibility; not used
        """
        word_map = {}

        # map distance between words
        for word_v, locations_v in corpus_list_v.items():
            for word_n, locations_n in corpus_list_n.items():
                # Signed pairwise distances, shape (noun positions, verb positions).
                dists = locations_n[1][:, np.newaxis] - locations_v[1]
                # Reduce over the whole matrix; the truth value of a
                # multi-element row is ambiguous and raises in numpy.
                if np.any(np.abs(dists) <= dist_cutoff):
                    print("Pattern between {} and {}".format(word_n, word_v))
        return word_map

    def map_to_bitstring(self, basis : list):
        """
        Assigns each basis element an integer state label.

        Returns a tuple (num_bits, bit_map), where bit_map maps the first item
        of each basis entry to labels 1..len(basis) and num_bits is the number
        of bits needed to represent the largest label.

        Keyword arguments:
        basis -- List of 2-item entries, e.g. the (word, count) tuples returned
                 by define_basis_words; only the first item is used
        """
        # Labels start at 1 because |0...0> is reserved for initialisation,
        # so the largest label is len(basis) and its bit length is the
        # register width (a power-of-two basis size needs the extra bit).
        num_bits = len(basis).bit_length()
        bit_map = {}
        for bitstring, (k, _) in enumerate(basis, start=1):
            bit_map[k] = bitstring
        return (num_bits, bit_map)

    def generate_state_mapping(self, bit_map, dat_map):
        """
        Takes the basis bitstring map, and the token-to-basis relationship, and returns a normalised set of states, with coefficients determined by the distance_func lambda, given the distance between the token and the resulting basis element.

        Returns a dict mapping each token to a list of (coefficient, state)
        tuples, where state is the integer label of the basis element.

        Keyword arguments:
        bit_map -- Tuple (num_bits, {basis_token: label}) as returned by map_to_bitstring
        dat_map -- Dict of token -> {basis_token: distance(s)} as returned by
                   map_to_basis. A distance may be a scalar or a sequence.
                   Tokens whose entry is None or empty are skipped.
        """
        basis_labels = bit_map[1]

        # Mapping token to array of tuples, first index the basis state coefficient and second the integer representation of the bitstring state
        state_encoding = {}
        for token, basis_dist_map in dat_map.items():
            # map_to_basis leaves None for tokens with no nearby basis element.
            if not basis_dist_map:
                continue

            local_coeffs = []
            local_states = []
            for basis_token, distance_list in basis_dist_map.items():
                # map_to_basis stores a single scalar distance per basis
                # token; promote it so an iterating distance_func can consume it.
                distances = np.atleast_1d(distance_list)
                # If more than one occurrence for the same word, apply the distance relation function then sum the results for that basis word coefficient
                local_coeffs.append(np.sum(self.distance_func(distances)))
                local_states.append(basis_labels[basis_token])

            # Calc normalisation factor over all the respective basis states for a given token
            norm_factor = np.linalg.norm(local_coeffs)
            state_encoding[token] = [
                (coeff / norm_factor, state)
                for coeff, state in zip(local_coeffs, local_states)
            ]
        return state_encoding

    def latex_states(self, bit_map, dat_map, file_name = "state"):
        r"""
        LaTeX file outputter for state generation. Given the above data structures, file_name.tex is generated. Beware, as output may need to replace '_' with '\_' for non-math-mode usage.

        Keyword arguments:
        bit_map   -- Tuple (num_bits, {basis_token: label}) as returned by map_to_bitstring
        dat_map   -- Token-to-basis distance map as returned by map_to_basis
        file_name -- Output path without the '.tex' extension
        """

        mapping = self.generate_state_mapping(bit_map, dat_map)
        with open(file_name + ".tex", "w") as f:
            f.write("\\documentclass{article} \n \\usepackage{amsmath} \\usepackage{multicol} \n \\begin{document} \n")
            # Ket template; the label is zero-padded in binary to num_bits digits.
            tex_string_format_bit = r'\vert {:0%db} \rangle'%(bit_map[0])
            f.write("\\section{Basis} \\begin{multicols}{2} \n \\noindent ")
            for b_key, b_val in bit_map[1].items():
                f.write(b_key + " $\\rightarrow " + tex_string_format_bit.format(b_val) + "$\\\\ ")
            f.write("\\end{multicols}")
            f.write("\\noindent\\rule{\\textwidth}{1pt} \n")
            f.write("\\noindent\\rule{\\textwidth}{1pt} \n")
            f.write("\\section{Encoding} \n")
            for token, basis_map in mapping.items():
                f.write(r"\begin{align}\vert \textrm{" + token + "} \\rangle &= \\\\ \n &" )
                last_idx = len(basis_map) - 1
                for i, b in enumerate(basis_map):
                    # Break the equation onto a new row after every third term.
                    if i != 0 and i % 3 == 0:
                        f.write(r" \\ & ")
                    f.write("{0:.3f}".format(round(b[0],3)))
                    f.write(tex_string_format_bit.format(b[1]) )
                    if i != last_idx:
                        f.write(r"+")
                        # NOTE(review): nesting inferred from an unindented
                        # listing; confirm this write belongs inside the branch.
                        f.write(" \\nonumber ")
                f.write(r"""\end{align}""")
                f.write("\\noindent\\rule{\\textwidth}{1pt} \n")
            f.write(r"\end{document}")
298 
299 
def nvn_distances(self, dict corpus_list_n, dict corpus_list_v, dist_cutoff=2, distance_func=None)
Definition: DisCoCat.py:188
def db_insert_discocat(self, values, dataset="basis", data_type="noun", table_name="qnlp")
Definition: DisCoCat.py:49
def drop_table(self, table_name="qnlp")
Definition: qnlp_db.py:57
def tokenise_corpus(self, corpus_text)
Definition: DisCoCat.py:106
def define_basis_words(self, dict word_dict, int max_words)
Definition: DisCoCat.py:126
def __init__(self, fd=lambda x :[1.0/(i+1) for i in x])
Definition: DisCoCat.py:100
def load_corpus(self, corpus_path)
Definition: DisCoCat.py:103
def connect_db(self)
Definition: qnlp_db.py:71
def map_to_basis(self, dict corpus_list, list basis, basis_dist_cutoff=10, distance_func=None)
from multimethod import multimethod #Allow multiple dispatch
Definition: DisCoCat.py:148
def create_table_discocat(self, table_name="qnlp")
Definition: DisCoCat.py:14
def generate_state_mapping(self, bit_map, dat_map)
Definition: DisCoCat.py:238
def word_occurrence(self, list corpus_list)
Definition: DisCoCat.py:111
def map_to_bitstring(self, list basis)
Definition: DisCoCat.py:227
def latex_states(self, bit_map, dat_map, file_name="state")
Definition: DisCoCat.py:268