QNLP  v1.0
QNLP.proc.DisCoCat.DisCoCat Class Reference
Collaboration diagram for QNLP.proc.DisCoCat.DisCoCat:
Collaboration graph

Public Member Functions

def __init__ (self, fd=lambda x :[1.0/(i+1) for i in x])
 
def load_corpus (self, corpus_path)
 
def tokenise_corpus (self, corpus_text)
 
def word_occurrence (self, list corpus_list)
 
def define_basis_words (self, dict word_dict, int max_words)
 
def map_to_basis (self, dict corpus_list, list basis, basis_dist_cutoff=10, distance_func=None)
 Maps the words from the corpus into the chosen basis. More...
 
def nvn_distances (self, dict corpus_list_n, dict corpus_list_v, dist_cutoff=2, distance_func=None)
 
def map_to_bitstring (self, list basis)
 
def generate_state_mapping (self, bit_map, dat_map)
 
def latex_states (self, bit_map, dat_map, file_name="state")
 

Data Fields

 distance_func
 

Detailed Description

Implements precomputation for the DisCo(Cat) model to represent sentence meanings
using category theory methods. See <PAPERS> for details.

Definition at line 95 of file DisCoCat.py.

Constructor & Destructor Documentation

◆ __init__()

def QNLP.proc.DisCoCat.DisCoCat.__init__ (   self,
  fd = lambda x : [1.0/(i+1) for i in x] 
)

Definition at line 100 of file DisCoCat.py.

100  def __init__(self, fd = lambda x : [1.0/(i+1) for i in x]):
101  self.distance_func = fd
102 

Member Function Documentation

◆ define_basis_words()

def QNLP.proc.DisCoCat.DisCoCat.define_basis_words (   self,
dict  word_dict,
int  max_words 
)
Chooses the max_words most common words from word_dict and returns them
as a list of (word, count) tuples, most frequent first, for use as the basis.

Definition at line 126 of file DisCoCat.py.

126  def define_basis_words(self, word_dict : dict, max_words : int):
127  """
128  Chooses the max_words number of most common words from word_dict
129  and return as list for use as basis.
130  """
131  k = list(word_dict.keys())
132  v = list(word_dict.values())
133  res_list = []
134 
135  for i in range(max_words):
136  max_val = max(v)
137  val_idx = v.index(max_val)
138  res_list.append((k[val_idx],max_val))
139  k.remove(k[val_idx])
140  v.remove(max_val)
141 
142  return res_list
143 

◆ generate_state_mapping()

def QNLP.proc.DisCoCat.DisCoCat.generate_state_mapping (   self,
  bit_map,
  dat_map 
)
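Takes the basis bitstring map and the token-to-basis relationship, and returns a dictionary mapping each token to a normalised list of (coefficient, basis state) tuples. Each coefficient is obtained by applying the distance_func lambda to the distances between the token and the corresponding basis element and summing the results; the coefficients for a token are then divided by their norm.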

Definition at line 238 of file DisCoCat.py.

238  def generate_state_mapping(self, bit_map, dat_map):
239  """
240  Takes the basis bitstring map, and the token-to-basis relationship, and returns a normalised set of states, with coefficients determined by the distance_func lambda, given the distance between the token and the resulting basis element.
241  """
242  num_states = bit_map[0]
243 
244  # Mapping token to array of tuples, first index the basis state coefficient and second the integer representation of the bitstring state
245  state_encoding = {}
246  for token, basis_dist_map in dat_map.items():
247  local_coeffs = []
248  local_states = []
249  for basis_token, distance_list in basis_dist_map.items():
250  # If more than one occurrence for the same word, apply the distance relation function then sum the results for that basis work coefficient
251  local_coeffs.append( np.sum( self.distance_func(distance_list) ) )
252  local_states.append( bit_map[1][basis_token] )
253 
254  # Calc normalisation factor over all the respective basis states for a given token
255  norm_factor = np.linalg.norm(local_coeffs)
256  for state_idx in range( len(local_states) ):
257  # Normalise the coefficient
258  local_coeffs[state_idx] /= norm_factor
259  current = state_encoding.get(token)
260  if current != None:
261  current.append( (local_coeffs[state_idx], local_states[state_idx],) )
262  else:
263  state_encoding.update({token : [(local_coeffs[state_idx], local_states[state_idx],)] })
264  return state_encoding
265 

References QNLP.proc.DisCoCat.DisCoCat.distance_func.

Referenced by QNLP.proc.DisCoCat.DisCoCat.latex_states().

Here is the caller graph for this function:

◆ latex_states()

def QNLP.proc.DisCoCat.DisCoCat.latex_states (   self,
  bit_map,
  dat_map,
  file_name = "state" 
)
Writes the generated states to a LaTeX file. Given the bit_map and dat_map data structures, file_name.tex is generated. Beware: any '_' in the output may need to be replaced with '\_' for use outside math mode.

Definition at line 268 of file DisCoCat.py.

268  def latex_states(self, bit_map, dat_map, file_name = "state"):
269  """
270  LaTeX file outputter for state generation. Given the above data structures, file_name.tex is generated. Beware, as output may need to replace '_' with '\_' for non-math-mode usage.
271  """
272 
273  mapping = self.generate_state_mapping(bit_map, dat_map)
274  with open(file_name + ".tex", "w") as f:
275  f.write("\\documentclass{article} \n \\usepackage{amsmath} \\usepackage{multicol} \n \\begin{document} \n")
276  tex_string_format_bit = r'\vert {:0%db} \rangle'%(bit_map[0])
277  f.write("\\section{Basis} \\begin{multicols}{2} \n \\noindent ")
278  for b_key, b_val in bit_map[1].items():
279  f.write(b_key + " $\\rightarrow " + tex_string_format_bit.format(b_val) + "$\\\\ ")
280  f.write("\\end{multicols}")
281  f.write("\\noindent\\rule{\\textwidth}{1pt} \n")
282  f.write("\\noindent\\rule{\\textwidth}{1pt} \n")
283  f.write("\\section{Encoding} \n")
284  for token, basis_map in mapping.items():
285  f.write(r"\begin{align}\vert \textrm{" + token + "} \\rangle &= \\\\ \n &" )
286  for i,b in enumerate(basis_map):
287  if( i != 0 ):
288  if(i%3 == 0):
289  f.write(r" \\ & ")
290  f.write("{0:.3f}".format(round(b[0],3)))
291  f.write(tex_string_format_bit.format(b[1]) )
292  if(i != len(basis_map) - 1):
293  f.write(r"+")
294  f.write(" \\nonumber ")
295  f.write(r"""\end{align}""")
296  f.write("\\noindent\\rule{\\textwidth}{1pt} \n")
297  f.write(r"\end{document}")
298 

References QNLP.proc.DisCoCat.DisCoCat.generate_state_mapping().

Here is the call graph for this function:

◆ load_corpus()

def QNLP.proc.DisCoCat.DisCoCat.load_corpus (   self,
  corpus_path 
)

Definition at line 103 of file DisCoCat.py.

103  def load_corpus(self, corpus_path):
104  return pc.load_corpus(corpus_path)
105 
def load_corpus(corpus_path, proc_mode=0)

◆ map_to_basis()

def QNLP.proc.DisCoCat.DisCoCat.map_to_basis (   self,
dict  corpus_list,
list  basis,
  basis_dist_cutoff = 10,
  distance_func = None 
)

from multimethod import multimethod #Allow multiple dispatch

Maps the words from the corpus into the chosen basis.         
Returns word_map dictionary, mapping corpus tokens -> basis states

Keyword arguments:
corpus_list         --  Dictionary mapping each corpus token to its occurrence data;
                element [1] of each value is used as the array of token positions
basis               --  List of basis tokens
basis_dist_cutoff   --  Cut-off for token distance from basis for it to be significant
distance_func       --  Function accepting the distances between basis and token, and
                returning the resulting scaling. If 'None', defaults to 
                self.distance_func (1/(d+1) unless overridden in the constructor)

Definition at line 148 of file DisCoCat.py.

148  def map_to_basis(self, corpus_list : dict, basis : list, basis_dist_cutoff=10, distance_func=None):
149  """
150  Maps the words from the corpus into the chosen basis.
151  Returns word_map dictionary, mapping corpus tokens -> basis states
152 
153  Keyword arguments:
154  corpus_list -- List of tokens representing corpus
155  basis -- List of basis tokens
156  basis_dist_cutoff -- Cut-off for token distance from basis for it to be significant
157  distance_func -- Function accepting distance between basis and token, and
158  returning the resulting scaling. If 'None', defaults to
159  1/coeff for scaling param
160  """
161 
162  if distance_func == None:
163  distance_func = self.distance_func #lambda x : [1.0/(i+1) for i in x]
164 
165  word_map = {}
166 
167  # map distance between basis words and other words in token list
168  for word, locations in corpus_list.items():
169  word_map.update({word : None})
170  for b_idx, b_val in enumerate(basis):
171  # Basis elements are orthogonal
172  if(b_val == word):
173  word_map.update({b_val : {b_val : 0}})
174  break
175  # to add left-right ordering here, remove the abs and use sign of distance to indicate where words appear relative to one another.
176  min_dist = np.min(np.abs(locations[1][:, np.newaxis] - corpus_list[b_val][1]))
177  m = (word, b_val, min_dist <= basis_dist_cutoff)
178 
179  if m[2] != False:
180  if(word_map.get(m[0]) != None):
181  update_val = word_map.get(m[0])
182  update_val.update({m[1] : min_dist})
183  word_map.update({m[0] : update_val })
184  else:
185  word_map.update({m[0] : {m[1] : min_dist} })
186  return word_map
187 

References QNLP.proc.DisCoCat.DisCoCat.distance_func.

◆ map_to_bitstring()

def QNLP.proc.DisCoCat.DisCoCat.map_to_bitstring (   self,
list  basis 
)

Definition at line 227 of file DisCoCat.py.

227  def map_to_bitstring(self, basis : list):
228  upper_bound_bitstrings = int(np.ceil(np.log2(len(basis))))
229  bit_map = {}
230  bitstring = 0 # Assume |0...0> state reserved for initialisation only
231  for k, v in basis:
232  bitstring += 1
233  bit_map.update({k: bitstring})
234  return (upper_bound_bitstrings, bit_map)
235 

◆ nvn_distances()

def QNLP.proc.DisCoCat.DisCoCat.nvn_distances (   self,
dict  corpus_list_n,
dict  corpus_list_v,
  dist_cutoff = 2,
  distance_func = None 
)
This function matches the NVN sentence structure, by locating adjacent
nouns and verbs, following the same procedure as used to map corpus words 
onto the basis. With this, we can construct relationships between the
verbs and their subject/object nouns.

Definition at line 188 of file DisCoCat.py.

188  def nvn_distances(self, corpus_list_n : dict, corpus_list_v : dict, dist_cutoff=2, distance_func=None):
189  """This function matches the NVN sentence structure, by locating adjacent
190  nouns and verbs, following the same procedure as used to map corpus words
191  onto the basis. With this, we can construct relationships between the
192  verbs and their subject/object nouns."""
193 
194  if distance_func == None:
195  distance_func = self.distance_func #lambda x : [1.0/(i+1) for i in x]
196 
197  word_map = {}
198 
199  # map distance between words
200  for word_v, locations_v in corpus_list_v.items():
201  for word_n, locations_n in corpus_list_n.items():
202  from IPython import embed; embed()
203 
204  dists = locations_n[1][:, np.newaxis] - locations_v[1]
205  if any([np.abs(x) <= dist_cutoff for x in dists]):
206  print("Pattern between {} and {}".format(word_n, word_v))
207  continue
208 
209  if(0):# if dist between v and noun is negative, order 1, if positive, order 2
210  word_map.update({word : None})
211 
212  # to add left-right ordering here, remove the abs and use sign of distance to indicate where words appear relative to one another.
213  min_dist = np.min(np.abs(locations[1][:, np.newaxis] - corpus_list[b_val][1]))
214  m = (word, b_val, min_dist <= basis_dist_cutoff)
215 
216  if m[2] != False:
217  if(word_map.get(m[0]) != None):
218  update_val = word_map.get(m[0])
219  update_val.update({m[1] : min_dist})
220  word_map.update({m[0] : update_val })
221  else:
222  word_map.update({m[0] : {m[1] : min_dist} })
223  return word_map
224 

References QNLP.proc.DisCoCat.DisCoCat.distance_func.

◆ tokenise_corpus()

def QNLP.proc.DisCoCat.DisCoCat.tokenise_corpus (   self,
  corpus_text 
)

Definition at line 106 of file DisCoCat.py.

106  def tokenise_corpus(self, corpus_text):
107  return pc.tokenize_corpus(corpus_text)
108 

◆ word_occurrence()

def QNLP.proc.DisCoCat.DisCoCat.word_occurrence (   self,
list  corpus_list 
)
Counts word occurrences in a given corpus, presented as a tokenised word list.
Returns a dictionary with the tokens as keys and their occurrence counts as values.

Definition at line 111 of file DisCoCat.py.

111  def word_occurrence(self, corpus_list : list):
112  """
113  Counts word occurrence in a given corpus, presented as a tokenised word list.
114  Returns a dictionary with keys as the tokens and values as the occurrences.
115  """
116  word_dict = {}
117  for word in corpus_list:
118  if word in word_dict:
119  word_dict[word] += 1
120  else:
121  word_dict[word] = 1
122  return word_dict
123 

Field Documentation

◆ distance_func


The documentation for this class was generated from the following file: