QNLP  v1.0
VectorSpaceModel.py

"""Example usage:

from QNLP import *
vsm_verbs = VectorSpaceModel.VectorSpaceModel("file.txt")
vsm_verbs.sort_tokens_by_dist("verbs")
vsm_verbs.assign_indexing("verbs")
"""

import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

import QNLP.proc.process_corpus as pc
import QNLP.encoding as enc

from os import path
import numpy as np
import networkx as nx
import itertools

from ortools.constraint_solver import routing_enums_pb2
from ortools.constraint_solver import pywrapcp

class VSM_pc:
    def __init__(self):
        self.pc = pc

    def tokenize_corpus(self, corpus, proc_mode=0, stop_words=True, use_spacy=False):
        """
        Rewrite of pc.tokenize_corpus to allow for tracking of basis word
        positions in list to improve later pairwise distance calculations.
        """
        token_sents = []
        token_words = []    # Individual words
        tags = []           # Words and respective tags
        tagged_tokens = []

        if use_spacy == False:
            token_sents = self.pc.nltk.sent_tokenize(corpus)  # Split on sentences

            for s in token_sents:
                tk = self.pc.nltk.word_tokenize(s)
                if stop_words == False:
                    tk = self.pc.remove_stopwords(tk, self.pc.sw)
                token_words.extend(tk)
                tags.extend(self.pc.nltk.pos_tag(tk))

            if proc_mode != 0:
                if proc_mode == 's':
                    s = self.pc.nltk.SnowballStemmer('english', ignore_stopwords=not stop_words)
                    token_words = [s.stem(t) for t in token_words]
                elif proc_mode == 'l':
                    wnl = self.pc.nltk.WordNetLemmatizer()
                    token_words = [wnl.lemmatize(t) for t in token_words]

            tagged_tokens = self.pc.nltk.pos_tag(token_words)

            #spacy_tokenizer = English()
        else:  # using spacy
            spacy_pos_tagger = spacy.load("en_core_web_sm")
            #spacy_pos_tagger.max_length = 2000000 #Uses approx 1GB memory for each 100k tokens; assumes large memory pool
            for s in spacy_pos_tagger(corpus):
                if stop_words == False and s.is_stop:
                    continue
                else:
                    text_val = s.text
                    if proc_mode != 0:
                        if proc_mode == 's':
                            raise Exception("Stemming not currently supported by spacy")
                        elif proc_mode == 'l':
                            text_val = s.lemma_

                    text_val = text_val.lower()
                    token_words.append(text_val)
                    tags.append((text_val, s.pos_))
            tagged_tokens = tags

        nouns = self._get_token_position(tagged_tokens, self.pc.tg.Noun)
        verbs = self._get_token_position(tagged_tokens, self.pc.tg.Verb)

        count_nouns = {k: (v.size, v) for k, v in nouns.items()}
        count_verbs = {k: (v.size, v) for k, v in verbs.items()}

        return {'verbs': count_verbs, 'nouns': count_nouns, 'tk_sentence': token_sents, 'tk_words': token_words}

    def _get_token_position(self, tagged_tokens, token_type):
        """ Tracks the positions where a tagged element is found in
        the tokenised corpus list. Useful for comparing distances.
        If the key doesn't initially exist, it adds a list with a
        single element. Otherwise, extends the list with the new
        token position value.
        """
        token_dict = {}
        for pos, token in enumerate(tagged_tokens):
            if pc.tg.matchables(token_type, token[1]):
                if token_dict.get(token[0]) is None:
                    token_dict.update({token[0]: np.array([pos])})
                else:
                    token_dict.update({token[0]: np.append(token_dict.get(token[0]), pos)})
        return token_dict
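
    # Illustrative sketch (not from the original source) of the dictionary returned by
    # tokenize_corpus: each noun/verb maps to a (count, positions) tuple. For a
    # hypothetical corpus "cats chase mice. dogs chase cats." one might expect roughly:
    #
    #   {'nouns': {'cats': (2, array([0, 6])), 'mice': (1, array([2])), 'dogs': (1, array([4]))},
    #    'verbs': {'chase': (2, array([1, 5]))},
    #    'tk_sentence': [...], 'tk_words': [...]}
    #
    # Exact counts and positions depend on the tagger, tokeniser and stop-word settings.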


class VectorSpaceModel:
    """
    Use vector space model of meaning to determine relative order of tokens in basis (see disco papers).
    Plan:

    - 1. Populate set of tokens of type t in corpus; label as T; O(n)
    - 2. Choose n top most occurring tokens from T, and define as basis elements of type t; 2 <= n <= |T|; O(n)
    - 3. Find relative (pairwise) distances between these tokens, and define as metric; n(n-1)/2 -> O(n^2)
    - 4. Sort tokens by distance metric; O(n*log(n))
    - 5. Assign tokens integer ID using Gray code mapping of sorted index; O(n)

    After the above steps the elements are readily available for encoding using the respective ID. Tokens that appear
    relatively close together will be closer in the sorted list, and as a result differ by fewer bit flips, which keeps
    their Hamming distance small when later comparing similarity of meanings.
    """
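
    # Why a Gray code in step 5 (worked illustration, not from the original source):
    # adjacent sorted indices 3 and 4 are '011' and '100' in plain binary (3 bit flips
    # apart), while g(n) = n ^ (n >> 1) gives Gray codes '010' and '110', one bit flip
    # apart. Tokens adjacent in the ordering therefore keep a small Hamming distance
    # once encoded.
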
    def __init__(self, corpus_path="", mode=0, stop_words=True, encoder=enc.gray.GrayEncoder(), use_spacy=False):
        self.pc = VSM_pc()
        self.tokens = self.load_tokens(corpus_path, mode, stop_words, use_spacy)
        self.encoder = encoder
        self.distance_dictionary = {}  # Pairwise token distances, keyed by token type; used by the sort_* methods below
        self.encoded_tokens = {}
        self.ordered_tokens = {}


    def load_tokens(self, corpus_path, mode=0, stop_words=True, use_spacy=False):
        """ 1. Wraps the calls from process_corpus.py to tokenize the corpus.
        Raises an exception if no corpus exists at the given path. """
        if path.exists(corpus_path):
            return self.pc.tokenize_corpus(pc.load_corpus(corpus_path), mode, stop_words, use_spacy)
        else:
            raise Exception("No corpus found at the given path.")


    def sort_basis_helper(self, token_type, num_elems):
        basis_dict = {token_type: {}}
        # consider Counter.most_common(num_elems)
        for counter, elem in enumerate(sorted(self.tokens[token_type].items(), key=lambda x: x[1][0], reverse=True)):
            if counter >= int(num_elems):
                break
            basis_dict[token_type].update({elem[0]: elem[1]})
        return basis_dict

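    # A minimal sketch of the Counter-based alternative hinted at in sort_basis_helper
    # (an assumption, not part of the original implementation):
    #
    #   from collections import Counter
    #   counts = Counter({k: v[0] for k, v in self.tokens[token_type].items()})
    #   top = {k: self.tokens[token_type][k] for k, _ in counts.most_common(int(num_elems))}
    #
    # This selects the same top-N tokens as the sorted()/break loop above, up to tie ordering.
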
    def define_basis(self, num_basis={"verbs": 8, "nouns": 8}):
        """ 2. Specify the number of basis elements in each space.
        Dict holds keys of type and values of number of elements to request."""
        if self.tokens is None:
            return
        basis = {}
        for t in ("nouns", "verbs"):
            basis.update(self.sort_basis_helper(t, num_basis[t]))

        return basis

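    # Usage sketch (hypothetical instance name):
    #   basis = vsm.define_basis({"verbs": 4, "nouns": 4})
    # returns {"nouns": {token: (count, positions), ...}, "verbs": {...}} holding the
    # most frequent tokens of each type.
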
    # Use entire token list; very expensive
    def sort_tokens_by_dist(self, tokens_type, graph_type=nx.DiGraph, dist_metric=lambda x, y: np.abs(x[:, np.newaxis] - y)):
        " 3. & 4."
        tk_list = list(self.tokens[tokens_type].keys())
        dist_dict = {}
        # pairwise distance calc
        for c0, k0 in enumerate(tk_list[0:-1]):
            for k1 in tk_list[c0:]:
                if k0 != k1:
                    a = self.tokens[tokens_type][k0][1]
                    b = self.tokens[tokens_type][k1][1]

                    dist_dict[(k0, k1)] = np.min(dist_metric(a, b))  # list(map(np.unique, dist_metric(a,b)))
                    #sorted([ dist_metric(i,j) for i in self.tokens[tokens_type][k0][1] for j in self.tokens[tokens_type][k1][1] ])

        self.distance_dictionary.update({tokens_type: dist_dict})

        """ Maps the tokens into a fully connected digraph, where each token
        is a node, and the weighted edge between them holds their
        respective distance. In the event of multiple distances between
        two nodes, assumes the minimum of the list.
        """
        token_graph = self._create_token_graph(dist_dict, graph_type)
        """ Using the token graph, a Hamiltonian path (cycle if [0] is
        connected to [-1]) is found for the graph, wherein the ordering gives
        the shortest path connecting each node, visited once. This gives a
        sufficiently ordered list for the encoding values.
        """

        self.ordered_tokens.update({tokens_type: self._get_ordered_tokens(token_graph)})
        return self.ordered_tokens

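    # The default dist_metric broadcasts the two position arrays against each other.
    # Illustrative values only:
    #   x = np.array([2, 10]); y = np.array([5, 7])
    #   np.abs(x[:, np.newaxis] - y)  ->  [[3, 5], [5, 3]]
    # np.min of that matrix (here 3) is taken as the distance between the two tokens.
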
    def sort_basis_tokens_by_dist(self, tokens_type, graph_type=nx.DiGraph, dist_metric=lambda x, y: np.abs(x[:, np.newaxis] - y), num_basis=16, ham_cycle=True):
        " 3. & 4."
        basis_tk_list = list(self.define_basis(num_basis={"verbs": num_basis, "nouns": num_basis})[tokens_type].keys())

        basis_dist_dict = {}
        # pairwise distance calc
        if len(basis_tk_list) > 1:
            for c0, k0 in enumerate(basis_tk_list[0:]):
                for k1 in basis_tk_list[c0:]:
                    if k0 != k1:
                        a = self.tokens[tokens_type][k0][1]
                        b = self.tokens[tokens_type][k1][1]

                        basis_dist_dict[(k0, k1)] = np.min(dist_metric(a, b))  # list(map(np.unique, dist_metric(a,b)))
                        #sorted([ dist_metric(i,j) for i in self.tokens[tokens_type][k0][1] for j in self.tokens[tokens_type][k1][1] ])
        else:
            basis_dist_dict.update({(basis_tk_list[0], basis_tk_list[0]): 0})

        self.distance_dictionary.update({tokens_type: basis_dist_dict})

        """ Maps the tokens into a fully connected digraph, where each token
        is a node, and the weighted edge between them holds their
        respective distance. In the event of multiple distances between
        two nodes, assumes the minimum of the list.
        """
        token_graph = self._create_token_graph(basis_dist_dict, graph_type)
        """ Using the token graph, a Hamiltonian path (cycle if [0] is
        connected to [-1]) is found for the graph, wherein the ordering gives
        the shortest path connecting each node, visited once. This gives a
        sufficiently ordered list for the encoding values.
        """

        self.ordered_tokens.update({tokens_type: self._get_ordered_tokens(token_graph, ham_cycle)})
        return self.ordered_tokens

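    # Usage sketch (hypothetical instance name):
    #   vsm.sort_basis_tokens_by_dist("verbs", num_basis=8)
    # populates self.distance_dictionary["verbs"] with pairwise basis distances and
    # self.ordered_tokens["verbs"] with the basis verbs in traversal order.
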
    def _create_token_graph(self, token_dist_pairs, graph_type=nx.DiGraph):
        """
        Creates graph using the (basis) tokens as nodes, and the pairwise
        distances between them as weighted edges. Used to determine optimal
        ordering of token adjacency for later encoding.
        """
        # Following: https://www.datacamp.com/community/tutorials/networkx-python-graph-tutorial

        assert callable(graph_type)

        token_graph = graph_type()

        for tokens_tuple, distances in token_dist_pairs.items():
            # Prevent multiple adds. Should be no problem in any case.
            if not token_graph.has_node(tokens_tuple[0]):
                token_graph.add_node(tokens_tuple[0])
            if not token_graph.has_node(tokens_tuple[1]):
                token_graph.add_node(tokens_tuple[1])

            # If multigraph allowed, add multiple edges between nodes.
            # If not, use the min distance.
            if graph_type == nx.MultiGraph:

                if not isinstance(distances, int):
                    for d in distances:
                        token_graph.add_edge(tokens_tuple[0], tokens_tuple[1], weight=d)
                else:
                    token_graph.add_edge(tokens_tuple[0], tokens_tuple[1], weight=distances)

            else:
                d_val = np.min(distances) if not isinstance(distances, int) else distances
                token_graph.add_edge(tokens_tuple[0], tokens_tuple[1], weight=d_val)

                if graph_type == nx.DiGraph:
                    token_graph.add_edge(tokens_tuple[1], tokens_tuple[0], weight=d_val)

        return token_graph

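    # Toy example (illustrative values only) of the structure built above:
    #   g = self._create_token_graph({("walk", "run"): 2, ("run", "jump"): 5, ("walk", "jump"): 4}, nx.DiGraph)
    # gives a complete digraph over {"walk", "run", "jump"} with the weight of each pair
    # added in both directions, as required by the ordering step below.
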
    def _get_ordered_tokens(self, token_graph: nx.DiGraph, ham_cycle=True):
        """
        Solves the Hamiltonian cycle problem to define the ordering of the basis tokens.
        If a cycle is not required, can solve the TSP problem instead (ham_cycle = False).
        """
        # Must be a directed graph
        assert isinstance(token_graph, nx.DiGraph)
        # Must be fully connected
        assert nx.tournament.is_strongly_connected(token_graph)
        if ham_cycle:
            return nx.tournament.hamiltonian_path(token_graph)
        else:
            token_order, dist_total = self._tsp_token_solver(token_graph)
            return token_order[:]

    def _tsp_token_solver(self, token_graph: nx.DiGraph):
        """
        Using or-tools to solve TSP of token_graph.
        Adapted from or-tools examples on TSP.
        """
        dist_dat = {}
        dist_dat['distance_matrix'] = nx.adjacency_matrix(token_graph).todense().tolist()  # Doesn't seem to like np matrices
        dist_dat['num_vehicles'] = 1  # Single route
        dist_dat['depot'] = 0  # Starting position at index 0; considering trying alternative options

        mgr = pywrapcp.RoutingIndexManager(
            len(dist_dat['distance_matrix']),
            dist_dat['num_vehicles'], dist_dat['depot']
        )
        routing = pywrapcp.RoutingModel(mgr)

        def dist_matrix_val(idx_0, idx_1):
            """Returns value from distance matrix between nodes"""
            n0 = mgr.IndexToNode(idx_0)
            n1 = mgr.IndexToNode(idx_1)
            return dist_dat['distance_matrix'][n0][n1]

        transit_callback_index = routing.RegisterTransitCallback(dist_matrix_val)

        routing.SetArcCostEvaluatorOfAllVehicles(transit_callback_index)

        # Setting first solution heuristic.
        search_parameters = pywrapcp.DefaultRoutingSearchParameters()
        search_parameters.first_solution_strategy = (
            routing_enums_pb2.FirstSolutionStrategy.PATH_CHEAPEST_ARC)

        # Solve the problem.
        assignment = routing.SolveWithParameters(search_parameters)

        dist_total = assignment.ObjectiveValue()

        index = routing.Start(0)
        token_order = []
        token_list = list(token_graph)

        while not routing.IsEnd(index):
            token_order.append(token_list[mgr.IndexToNode(index)])
            index = assignment.Value(routing.NextVar(index))
        return token_order, dist_total

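    # Note on the route extraction above: the walk starts from the 'depot' token at
    # matrix index 0 and lists every node once; dist_total is the objective value of
    # the chosen route. or-tools expects integer arc costs, which should hold here
    # since the weights are integer token-position differences.
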
    def _calc_token_order_distance(self, token_order_list, token_type):
        sum_total = []
        for idx in range(1, len(token_order_list)):
            # May be ordered either way
            v0 = self.distance_dictionary.get(token_type).get((token_order_list[idx - 1], token_order_list[idx]))
            v1 = self.distance_dictionary.get(token_type).get((token_order_list[idx], token_order_list[idx - 1]))
            if v0 is None:
                sum_total.append(np.min(v1))
            else:
                sum_total.append(np.min(v0))

        return (np.sum(sum_total), sum_total)

    def assign_indexing(self, token_type):
        """ 5. Encode the ordered tokens using a code based on indexed
        location. Values close together will have fewer bit flips.
        """
        t_dict = {}

        for idx, token in enumerate(self.ordered_tokens[token_type]):
            t_dict.update({token: self.encoder.encode(idx, token_type)})

        self.encoded_tokens.update({token_type: t_dict})

        return self.encoded_tokens

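    # Usage sketch (hypothetical instance name): after ordering the basis,
    #   vsm.assign_indexing("verbs")
    # fills self.encoded_tokens["verbs"] with {token: code for its sorted index}; with
    # the default GrayEncoder, adjacent tokens should receive codes one bit flip apart.
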
    def calc_diff_matrix(self):
        "WIP"
        return  # Early return: the sketch below is not yet active.
        X = np.zeros([len(self.tokens['verbs'])] * 2)

        # Add forward and inverse mapping in dictionary
        tup_map_f = {k: i for i, k in enumerate(self.tokens['verbs'].keys())}
        tup_map_i = {i: k for i, k in enumerate(self.tokens['verbs'].keys())}

    def getPathLength(self, token_type):
        "Calculate cumulative path length of resulting basis ordering"
        total = 0
        for i in range(len(self.ordered_tokens[token_type]) - 1):
            try:
                total += self.distance_dictionary[token_type][(self.ordered_tokens[token_type][i], self.ordered_tokens[token_type][i + 1])]
            except KeyError:  # Pair may be stored in the reverse order
                total += self.distance_dictionary[token_type][(self.ordered_tokens[token_type][i + 1], self.ordered_tokens[token_type][i])]
        return total
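

if __name__ == "__main__":
    # Minimal end-to-end sketch (not part of the original module), mirroring the example
    # in the module docstring. "corpus.txt" is a placeholder path; adjust as needed.
    vsm = VectorSpaceModel(corpus_path="corpus.txt", mode=0, stop_words=True)
    vsm.sort_basis_tokens_by_dist("verbs", num_basis=8)
    vsm.assign_indexing("verbs")
    print(vsm.encoded_tokens["verbs"])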