from typing import Dict, Tuple
def create_table_discocat(self, table_name="qnlp"):
    """
    Create the database table for tagging the required data.

    The DB has columns for dataset ('basis' or 'corpus'), data_type ('verb',
    'noun', etc.), token (the string value), mapping_idx (the index of the
    mapped binary value; for superpos states, this index labels the number of
    values in the superposition), map_bin_id (the binary value representing
    the quantum state in the register), map_coeff_r/i (real and imaginary
    coefficients of the map_bin_id state), mapping_dir (indicates the
    direction of the mapping; may not be used).

    Keyword arguments:
    table_name -- Name of the table to create (default "qnlp")
    """
    # NOTE(review): the column definitions were reconstructed from the
    # docstring above; the original type affinities were not visible — confirm.
    # table_name is interpolated directly into the SQL, so it must come from
    # trusted code, never from user input.
    cr_tbl = """CREATE TABLE {}(
        id INTEGER PRIMARY KEY,
        dataset TEXT,
        data_type TEXT,
        token TEXT,
        mapping_idx INTEGER,
        map_bin_id INTEGER,
        map_coeff_r REAL,
        map_coeff_i REAL,
        mapping_dir INTEGER
    );""".format(table_name)

    conn = self.db_connect()  # presumably returns the sqlite3 connection — TODO confirm
    c = conn.cursor()
    try:
        c.execute(cr_tbl)
    except sqlite3.OperationalError as oe:
        # Table already exists: offer to drop it and recreate from scratch.
        remove_db = input("Table '{}' already exists. Remove? y/n: ".format(table_name))
        if remove_db.lower() == "y":
            self.drop_table(table_name)
            self.create_table_discocat(table_name)
    except Exception as e:
        print("SQLITE exception thrown: {0}".format(e), "Exiting program.")
        exit()
def db_insert_discocat(self, values, dataset="basis", data_type="noun", table_name="qnlp"):
    """
    Insert the tag to binary encoding mapping into the DB.

    Keyword arguments:
    values -- Dict mapping string to binary value, and binary value to string.
    dataset -- Label for the dataset the rows belong to (default "basis")
    data_type -- String to indicate the type of data to be stored
    table_name -- Name of table to store in DB

    The DB insert operation below assumes the value field of a key in DB is a
    tuple, containing (binary_id, weight of occurrence), where weight of
    occurrence can be determined by the proximity of the word to other words;
    essentially a count in the simplest case. The mapping checks to see if the
    index is convertible to a numeric type. If so, this will imply the reverse
    mapping (ie qubit result to string val), and is indicated by -1.
    Otherwise, this will be a forward mapping, and given by 1.
    """
    conn = self.db_connect()  # presumably returns the sqlite3 connection — TODO confirm
    c = conn.cursor()
    for corpus_token, superpos in values.items():
        l_superpos = len(superpos)
        # Reverse mapping (-1) when the key is numeric (qubit result -> string),
        # otherwise forward (1) — per the docstring; confirm against original.
        mapping_dir = -1 if str(corpus_token).isnumeric() else 1
        for idx, (distance_measure, basis_state) in enumerate(superpos):
            # Parameterized query: only the (trusted) table name is interpolated.
            c.execute(
                """INSERT INTO {} (
                    dataset,
                    data_type,
                    token,
                    mapping_idx,
                    map_bin_id,
                    map_coeff_r,
                    map_coeff_i,
                    mapping_dir ) VALUES(?,?,?,?,?,?,?,?)""".format(table_name),
                (dataset,
                 data_type,
                 corpus_token,
                 idx,  # NOTE(review): orig. may have stored l_superpos here — confirm
                 basis_state,
                 distance_measure.real,
                 distance_measure.imag,
                 mapping_dir,)
            )
    conn.commit()
def __init__(self, fd=lambda x: [1.0/(i+1) for i in x]):
    """
    Implements precomputation for the DisCo(Cat) model to represent sentence
    meanings using category theory methods. See the DisCoCat papers for
    details.

    Keyword arguments:
    fd -- Distance scaling function: maps an iterable of distances to a list
          of coefficient weights (default: 1/(i+1) for each element i).
    """
    # Consumed later by generate_state_mapping via self.distance_func.
    self.distance_func = fd

def load_corpus(self, corpus_path):
    """Load the raw corpus text from corpus_path via the pc helper module."""
    return pc.load_corpus(corpus_path)

def tokenise_corpus(self, corpus_text):
    """Tokenise the raw corpus text via the pc helper module.

    Note the British/American spelling split: this method is 'tokenise',
    the pc helper is 'tokenize'.
    """
    return pc.tokenize_corpus(corpus_text)
def word_occurrence(self, corpus_list):
    """
    Counts word occurrence in a given corpus, presented as a tokenised word
    list. Returns a dictionary with keys as the tokens and values as the
    occurrences.

    Keyword arguments:
    corpus_list -- List of string tokens
    """
    word_dict = {}
    for word in corpus_list:
        if word in word_dict:
            word_dict[word] += 1
        else:
            word_dict[word] = 1
    return word_dict
def define_basis_words(self, word_dict, max_words):
    """
    Chooses the max_words number of most common words from word_dict and
    returns them as a list of (token, occurrence) tuples for use as basis.

    Keyword arguments:
    word_dict -- Dict of token -> occurrence count (see word_occurrence)
    max_words -- Number of most-common tokens to keep

    If max_words exceeds the number of distinct tokens, all tokens are
    returned (most common first) rather than raising on an empty max().
    """
    res_list = []
    k = list(word_dict.keys())
    v = list(word_dict.values())
    # Guard against max_words > len(word_dict) (max() of empty list raises).
    for i in range(min(max_words, len(v))):
        max_val = max(v)
        val_idx = v.index(max_val)
        res_list.append((k[val_idx], max_val))
        # Remove the chosen entry so the next pass finds the next-most-common.
        del k[val_idx]
        del v[val_idx]
    return res_list
def map_to_basis(self, corpus_list, basis, basis_dist_cutoff=10, distance_func=None):
    """
    Maps the words from the corpus into the chosen basis.
    Returns word_map dictionary, mapping corpus tokens -> basis states.
    Tokens with no basis element within the cut-off keep a None entry.

    Keyword arguments:
    corpus_list -- Dict of corpus token -> (tag, np.array of token positions);
                   assumed to contain every basis token — TODO confirm
    basis -- List of basis tokens
    basis_dist_cutoff -- Cut-off for token distance from basis for it to be
                         significant
    distance_func -- Function accepting distance between basis and token, and
                     returning the resulting scaling. If 'None', defaults to
                     1/coeff for scaling param
    """
    if distance_func == None:
        # Default: 1/coeff scaling, per the docstring.
        distance_func = lambda x: 1.0 / x

    word_map = {}
    for word, locations in corpus_list.items():
        word_map.update({word: None})
        for b_idx, b_val in enumerate(basis):
            if word == b_val:
                # A basis token maps onto itself with zero distance.
                word_map.update({b_val: {b_val: 0}})
                continue
            # Minimum absolute separation between any occurrence of `word`
            # and any occurrence of the basis token (broadcast over positions).
            min_dist = np.min(np.abs(locations[1][:, np.newaxis] - corpus_list[b_val][1]))
            m = (word, b_val, min_dist <= basis_dist_cutoff)
            if m[2]:
                if word_map.get(m[0]) != None:
                    update_val = word_map.get(m[0])
                    update_val.update({m[1]: min_dist})
                    word_map.update({m[0]: update_val})
                else:
                    word_map.update({m[0]: {m[1]: min_dist}})
    return word_map
def nvn_distances(self, corpus_list_n, corpus_list_v, dist_cutoff=2, distance_func=None):
    """This function matches the NVN sentence structure, by locating adjacent
    nouns and verbs, following the same procedure as used to map corpus words
    onto the basis. With this, we can construct relationships between the
    verbs and their subject/object nouns.

    Keyword arguments:
    corpus_list_n -- Dict of noun token -> (tag, np.array of token positions)
    corpus_list_v -- Dict of verb token -> (tag, np.array of token positions)
    dist_cutoff -- Maximum separation for a noun/verb pair to be adjacent
    distance_func -- Optional distance-scaling function (default 1/coeff,
                     matching map_to_basis)

    NOTE(review): the original body contained a leftover debugging hook
    ('from IPython import embed; embed()') and an unreachable copy of the
    map_to_basis mapping code referencing names undefined in this scope
    (word, locations, b_val, basis_dist_cutoff); both removed.
    """
    if distance_func == None:
        distance_func = lambda x: 1.0 / x

    word_map = {}
    for word_v, locations_v in corpus_list_v.items():
        for word_n, locations_n in corpus_list_n.items():
            # Pairwise separations between every noun and verb occurrence.
            dists = locations_n[1][:, np.newaxis] - locations_v[1]
            # FIX: 'any([np.abs(x) <= dist_cutoff for x in dists])' iterated
            # the rows of a 2-D array, raising an ambiguous-truth-value error
            # whenever a token occurred more than once; np.any over the full
            # array is the intended test.
            if np.any(np.abs(dists) <= dist_cutoff):
                print("Pattern between {} and {}".format(word_n, word_v))
    return word_map
def map_to_bitstring(self, basis):
    """
    Assigns a binary register label to each basis token.

    Keyword arguments:
    basis -- List of (token, occurrence) tuples (see define_basis_words)

    Returns a tuple (upper_bound_bitstrings, bit_map), where
    upper_bound_bitstrings = ceil(log2(len(basis))) is the number of qubits
    needed to label every basis element, and bit_map maps token -> integer
    bitstring value.
    """
    upper_bound_bitstrings = int(np.ceil(np.log2(len(basis))))
    bit_map = {}
    # NOTE(review): sequential enumeration reconstructed — the original
    # assignment loop (lines 229-232) was not visible; confirm ordering.
    for bitstring, (k, _occurrence) in enumerate(basis):
        bit_map.update({k: bitstring})
    return (upper_bound_bitstrings, bit_map)
def generate_state_mapping(self, bit_map, dat_map):
    """
    Takes the basis bitstring map, and the token-to-basis relationship, and
    returns a normalised set of states, with coefficients determined by the
    distance_func lambda, given the distance between the token and the
    resulting basis element.

    Keyword arguments:
    bit_map -- Tuple (num qubits, {basis token -> bitstring}); see
               map_to_bitstring
    dat_map -- Dict of token -> {basis token -> distance data}; see
               map_to_basis

    Returns dict of token -> list of (coefficient, bitstring) pairs.
    """
    num_states = bit_map[0]  # qubit count; retained from original, unused here
    state_encoding = {}
    for token, basis_dist_map in dat_map.items():
        local_coeffs = []
        local_states = []
        for basis_token, distance_list in basis_dist_map.items():
            # Coefficient is the summed distance-scaled weight for this basis
            # element; distance_func comes from __init__'s fd argument.
            local_coeffs.append(np.sum(self.distance_func(distance_list)))
            local_states.append(bit_map[1][basis_token])
        # L2-normalise so the superposition is a valid quantum state.
        norm_factor = np.linalg.norm(local_coeffs)
        for state_idx in range(len(local_states)):
            local_coeffs[state_idx] /= norm_factor
            current = state_encoding.get(token)
            if current != None:
                current.append((local_coeffs[state_idx], local_states[state_idx],))
            else:
                state_encoding.update({token: [(local_coeffs[state_idx], local_states[state_idx],)]})
    return state_encoding
def latex_states(self, bit_map, dat_map, file_name="state"):
    """
    LaTeX file outputter for state generation. Given the above data
    structures, file_name.tex is generated. Beware, as output may need to
    replace '_' with '\\_' for non-math-mode usage.

    Keyword arguments:
    bit_map -- Tuple (num qubits, {basis token -> bitstring}); see
               map_to_bitstring
    dat_map -- Dict of token -> {basis token -> distance data}; see
               map_to_basis
    file_name -- Output file name without the '.tex' suffix (default "state")
    """
    # NOTE(review): the encoding derivation was not visible in the original;
    # reconstructed as a call to generate_state_mapping — confirm.
    mapping = self.generate_state_mapping(bit_map, dat_map)
    with open(file_name + ".tex", "w") as f:
        f.write("\\documentclass{article} \n \\usepackage{amsmath} \\usepackage{multicol} \n \\begin{document} \n")
        # Formats an integer as a zero-padded ket of bit_map[0] binary digits.
        tex_string_format_bit = r'\vert {:0%db} \rangle' % (bit_map[0])
        f.write("\\section{Basis} \\begin{multicols}{2} \n \\noindent ")
        for b_key, b_val in bit_map[1].items():
            f.write(b_key + " $\\rightarrow " + tex_string_format_bit.format(b_val) + "$\\\\ ")
        f.write("\\end{multicols}")
        f.write("\\noindent\\rule{\\textwidth}{1pt} \n")
        f.write("\\noindent\\rule{\\textwidth}{1pt} \n")
        f.write("\\section{Encoding} \n")
        for token, basis_map in mapping.items():
            f.write(r"\begin{align}\vert \textrm{" + token + "} \\rangle &= \\\\ \n &")
            for i, b in enumerate(basis_map):
                # b is a (coefficient, bitstring) pair.
                f.write("{0:.3f}".format(round(b[0], 3)))
                f.write(tex_string_format_bit.format(b[1]))
                if (i != len(basis_map) - 1):
                    # NOTE(review): separator between superposition terms
                    # reconstructed (original line 293 not visible) — confirm.
                    f.write(" + ")
                f.write(" \\nonumber ")
            f.write(r"""\end{align}""")
            f.write("\\noindent\\rule{\\textwidth}{1pt} \n")
        f.write(r"\end{document}")
# --- Signature index (extraction artifact; Cython-style annotations; retained for reference) ---
# def nvn_distances(self, corpus_list_n: dict, corpus_list_v: dict, dist_cutoff=2, distance_func=None)
# def db_insert_discocat(self, values, dataset="basis", data_type="noun", table_name="qnlp")
# def drop_table(self, table_name="qnlp")
# def tokenise_corpus(self, corpus_text)
# def define_basis_words(self, word_dict: dict, max_words: int)
# def __init__(self, fd=lambda x: [1.0/(i+1) for i in x])
# def load_corpus(self, corpus_path)
# def map_to_basis(self, corpus_list: dict, basis: list, basis_dist_cutoff=10, distance_func=None)
from multimethod import multimethod  # Allow multiple dispatch
# def create_table_discocat(self, table_name="qnlp")
# def generate_state_mapping(self, bit_map, dat_map)
# def word_occurrence(self, corpus_list: list)
# def map_to_bitstring(self, basis: list)
# def latex_states(self, bit_map, dat_map, file_name="state")