vsm_verbs = VectorSpaceModel.VectorSpaceModel("file.txt")
vsm_verbs.sort_tokens_by_dist("verbs")
vsm_verbs.assign_indexing()

from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

from ortools.constraint_solver import routing_enums_pb2
from ortools.constraint_solver import pywrapcp
Rewrite of pc.tokenize_corpus to allow for tracking of basis word
positions in the token list to improve later pairwise distance calculations.

def tokenize_corpus(self, corpus, proc_mode=0, stop_words=True, use_spacy=False):
    token_words = []
    tags = []
    if use_spacy == False:
        token_sents = self.pc.nltk.sent_tokenize(corpus)
        for s in token_sents:
            tk = self.pc.nltk.word_tokenize(s)
            if stop_words == False:
                ...  # stop-word filtering (elided)
            token_words.extend(tk)
            tags.extend(self.pc.nltk.pos_tag(tk))
        if proc_mode == 's':
            s = self.pc.nltk.SnowballStemmer('english', ignore_stopwords = not stop_words)
            token_words = [s.stem(t) for t in token_words]
        elif proc_mode == 'l':
            wnl = self.pc.nltk.WordNetLemmatizer()
            token_words = [wnl.lemmatize(t) for t in token_words]

        tagged_tokens = self.pc.nltk.pos_tag(token_words)
    else:
        spacy_pos_tagger = spacy.load("en_core_web_sm")
        for s in spacy_pos_tagger(corpus):
            if stop_words == False and s.is_stop:
                continue
            text_val = s.text        # assumed default: the raw token text
            if proc_mode == 's':
                raise Exception("Stemming not currently supported by spacy")
            elif proc_mode == 'l':
                text_val = s.lemma_  # assumed: spaCy lemma in lemmatisation mode
            text_val = text_val.lower()
            token_words.append(text_val)
            tags.append((text_val, s.pos_))
    ...  # noun/verb position dictionaries gathered from the tagged tokens (elided)
    count_nouns = { k: (v.size, v) for k, v in nouns.items() }
    count_verbs = { k: (v.size, v) for k, v in verbs.items() }

    return {'verbs': count_verbs, 'nouns': count_nouns,
            'tk_sentence': token_sents, 'tk_words': token_words}
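For orientation, the returned dictionary might look as follows for a toy corpus; the tokens, counts and positions are illustrative only, and each noun/verb entry pairs an occurrence count with the numpy array of positions:

import numpy as np

# Illustrative only: layout of the dictionary returned by tokenize_corpus.
example_result = {
    'verbs': {'run': (2, np.array([1, 5])), 'see': (1, np.array([3]))},
    'nouns': {'dog': (3, np.array([0, 2, 4]))},
    'tk_sentence': ['dog run', 'dog see dog run'],
    'tk_words': ['dog', 'run', 'dog', 'see', 'dog', 'run'],
}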
90 """ Tracks the positions where a tagged element is found in 91 the tokenised corpus list. Useful for comparing distances. 92 If the key doesn't initially exist, it adds a list with a 93 single element. Otherwise, extends the list with the new 97 for pos, token
in enumerate(tagged_tokens):
98 if pc.tg.matchables(token_type, token[1]):
99 if isinstance(token_dict.get(token[0]), type(
None)):
100 token_dict.update( { token[0] : np.array([pos])} )
102 token_dict.update( { token[0] : np.append(token_dict.get(token[0]), pos) } )
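A runnable miniature of the position-tracking logic above, with a toy noun check standing in for pc.tg.matchables (words and tags are hypothetical):

import numpy as np

tagged = [('dog', 'NN'), ('runs', 'VBZ'), ('dog', 'NN')]
token_dict = {}
for pos, (word, tag) in enumerate(tagged):
    if tag.startswith('NN'):                      # toy stand-in for pc.tg.matchables
        if token_dict.get(word) is None:
            token_dict[word] = np.array([pos])    # first sighting: single-element array
        else:
            token_dict[word] = np.append(token_dict[word], pos)  # later sightings appended
# token_dict == {'dog': array([0, 2])}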
Use vector space model of meaning to determine relative order of tokens in basis (see disco papers).

- 1. Populate set of tokens of type t in corpus; label as T; O(n)
- 2. Choose n top most occurring tokens from T, and define as basis elements of type t; 2 <= n <= |T|; O(n)
- 3. Find relative (pairwise) distances between these tokens, and define as metric; n(n-1)/2 -> O(n^2)
- 4. Sort tokens by distance metric; O(n*log(n))
- 5. Assign tokens integer ID using Gray code mapping of sorted index; O(n)

After the above steps the elements are readily available for encoding using the respective ID. Tokens that appear
relatively close together will be closer in the sorted list, and as a result differ by fewer bit flips, which can be
used in the Hamming distance calculation later for similarity of meanings.

def __init__(self, corpus_path="", mode=0, stop_words=True, encoder=enc.gray.GrayEncoder(), use_spacy=False):
    ...
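The bit-flip property claimed in step 5 can be checked with a minimal sketch of the standard binary-reflected Gray code; this is not the project's enc.gray.GrayEncoder, only an illustration:

def gray(n):
    """Binary-reflected Gray code of integer n."""
    return n ^ (n >> 1)

def hamming(a, b):
    """Number of differing bits between two integers."""
    return bin(a ^ b).count('1')

# Adjacent sorted indices always differ by exactly one bit...
assert hamming(gray(3), gray(4)) == 1   # 0b010 vs 0b110
# ...whereas indices further apart generally differ by more bits.
assert hamming(gray(0), gray(5)) == 3   # 0b000 vs 0b111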
def load_tokens(self, corpus_path, mode=0, stop_words=True, use_spacy=False):
    """ 1. Wraps the calls from process_corpus.py to tokenize. Returns None if path is invalid. """
    if path.exists(corpus_path):
        return self.pc.tokenize_corpus( pc.load_corpus(corpus_path), mode, stop_words, use_spacy )
    raise Exception("No corpus found at the given path.")
def sort_basis_helper(self, token_type, num_elems):
    basis_dict = {token_type : {}}
    for counter, elem in enumerate(sorted(self.tokens[token_type].items(), key=lambda x: x[1][0], reverse=True)):
        if counter >= int(num_elems):
            break
        basis_dict[token_type].update({elem[0] : elem[1]})
    return basis_dict
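Each value in self.tokens[token_type] is a (count, positions) pair, so key=lambda x: x[1][0] orders tokens by occurrence count; a toy illustration with hypothetical tokens (positions shown as plain lists for brevity):

tokens = {'walk': (5, [2, 9, 14, 20, 31]),
          'see':  (2, [4, 17]),
          'run':  (8, [0, 3, 6, 8, 11, 22, 25, 28])}
top_two = sorted(tokens.items(), key=lambda x: x[1][0], reverse=True)[:2]
# -> [('run', (8, [...])), ('walk', (5, [...]))]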
157 """ 2. Specify the number of basis elements in each space. 158 Dict holds keys of type and values of number of elements to request.""" 162 for t
in (
"nouns",
"verbs"):
def sort_tokens_by_dist(self, tokens_type, graph_type=nx.DiGraph,
                        dist_metric=lambda x, y: np.abs(x[:, np.newaxis] - y)):
    tk_list = list(self.tokens[tokens_type].keys())
    dist_dict = {}
    for c0, k0 in enumerate(tk_list[0:-1]):
        for k1 in tk_list[c0:]:
            a = self.tokens[tokens_type][k0][1]
            b = self.tokens[tokens_type][k1][1]
            dist_dict[(k0, k1)] = np.min(dist_metric(a, b))
187 """ Maps the tokens into a fully connected digraph, where each token 188 is a node, and the weighted edge between them holds their 189 respective distance. In the event of multiple distances between 190 two node, assumes the minimum of the list. 193 """ Using the token graph, a Hamiltonian path (cycle if [0] 194 connected to [-1]) is found for the graph, wherein the ordering gives 195 the shortest path connecting each node, visited once. This gives a 196 sufficiently ordered list for the encoding values. 202 def sort_basis_tokens_by_dist(self, tokens_type, graph_type = nx.DiGraph, dist_metric = lambda x,y : np.abs(x[:, np.newaxis] - y), num_basis = 16, ham_cycle =
True):
204 basis_tk_list = list(self.
define_basis(num_basis={
"verbs":num_basis,
"nouns" : num_basis})[tokens_type].keys())
    basis_dist_dict = {}
    if len(basis_tk_list) > 1:
        for c0, k0 in enumerate(basis_tk_list[0:]):
            for k1 in basis_tk_list[c0:]:
                a = self.tokens[tokens_type][k0][1]
                b = self.tokens[tokens_type][k1][1]
                basis_dist_dict[(k0, k1)] = np.min(dist_metric(a, b))
    else:
        basis_dist_dict.update({ (basis_tk_list[0], basis_tk_list[0]) : 0 })
222 """ Maps the tokens into a fully connected digraph, where each token 223 is a node, and the weighted edge between them holds their 224 respective distance. In the event of multiple distances between 225 two node, assumes the minimum of the list. 228 """ Using the token graph, a Hamiltonian path (cycle if [0] 229 connected to [-1]) is found for the graph, wherein the ordering gives 230 the shortest path connecting each node, visited once. This gives a 231 sufficiently ordered list for the encoding values. 241 Creates graph using the (basis) tokens as nodes, and the pairwise 242 distances between them as weighted edges. Used to determine optimal 243 ordering of token adjacency for later encoding. 247 assert( callable(graph_type) )
    token_graph = graph_type()

    for tokens_tuple, distances in token_dist_pairs.items():
        # Ensure both tokens exist as nodes before adding weighted edges.
        if not token_graph.has_node(tokens_tuple[0]):
            token_graph.add_node(tokens_tuple[0])
        if not token_graph.has_node(tokens_tuple[1]):
            token_graph.add_node(tokens_tuple[1])

        if graph_type == nx.MultiGraph:
            # MultiGraph keeps a parallel edge per recorded distance.
            if not isinstance(distances, int):
                for d in distances:
                    token_graph.add_edge( tokens_tuple[0], tokens_tuple[1], weight=d )
            else:
                token_graph.add_edge( tokens_tuple[0], tokens_tuple[1], weight=distances )
        else:
            # Graph/DiGraph take a single edge weighted by the minimum distance.
            d_val = np.min(distances) if not isinstance(distances, int) else distances
            token_graph.add_edge( tokens_tuple[0], tokens_tuple[1], weight=d_val )
            if graph_type == nx.DiGraph:
                # Mirror the edge so both directions carry the same weight.
                token_graph.add_edge( tokens_tuple[1], tokens_tuple[0], weight=d_val )
    return token_graph
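A minimal usage sketch of the graph construction for the nx.DiGraph case (token names and distances hypothetical), showing the mirrored weighted edges that result:

import networkx as nx

pairs = {('run', 'walk'): 1, ('run', 'see'): 4, ('walk', 'see'): 2}

g = nx.DiGraph()
for (t0, t1), d in pairs.items():
    g.add_edge(t0, t1, weight=d)
    g.add_edge(t1, t0, weight=d)   # mirrored edge, as in the DiGraph branch above
# g.number_of_edges() == 6 and g['run']['walk']['weight'] == 1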
def _get_ordered_tokens(self, token_graph, ham_cycle=True):
    """
    Solves the Hamiltonian cycle problem to define the ordering of the basis tokens.
    If a cycle is not required, can solve the TSP problem instead (ham_cycle = False).
    """
    assert( isinstance(token_graph, nx.DiGraph) )
    if ham_cycle:
        assert( nx.tournament.is_strongly_connected(token_graph) )
        return nx.tournament.hamiltonian_path(token_graph)
    token_order, _ = self._tsp_token_solver(token_graph)
    return token_order[:]
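For the Hamiltonian-path branch, the networkx tournament routines expect a tournament (exactly one directed edge per node pair); a minimal sketch on a toy tournament:

import networkx as nx

t = nx.DiGraph([(0, 1), (1, 2), (0, 2)])    # one directed edge per pair of nodes
order = nx.tournament.hamiltonian_path(t)   # every tournament has a Hamiltonian path
# order == [0, 1, 2]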
def _tsp_token_solver(self, token_graph):
    """
    Using or-tools to solve TSP of token_graph.
    Adapted from or-tools examples on TSP.
    """
    dist_dat = {}
    dist_dat['distance_matrix'] = nx.adjacency_matrix(token_graph).todense().tolist()
    dist_dat['num_vehicles'] = 1
    dist_dat['depot'] = 0

    mgr = pywrapcp.RoutingIndexManager(
        len(dist_dat['distance_matrix']),
        dist_dat['num_vehicles'], dist_dat['depot']
    )
    routing = pywrapcp.RoutingModel(mgr)

    def dist_matrix_val(idx_0, idx_1):
        """Returns value from distance matrix between nodes"""
        n0 = mgr.IndexToNode(idx_0)
        n1 = mgr.IndexToNode(idx_1)
        return dist_dat['distance_matrix'][n0][n1]

    transit_callback_index = routing.RegisterTransitCallback(dist_matrix_val)
    routing.SetArcCostEvaluatorOfAllVehicles(transit_callback_index)

    search_parameters = pywrapcp.DefaultRoutingSearchParameters()
    search_parameters.first_solution_strategy = (
        routing_enums_pb2.FirstSolutionStrategy.PATH_CHEAPEST_ARC)

    assignment = routing.SolveWithParameters(search_parameters)
    dist_total = assignment.ObjectiveValue()

    index = routing.Start(0)
    token_order = []
    token_list = list(token_graph)

    while not routing.IsEnd(index):
        token_order.append(token_list[mgr.IndexToNode(index)])
        previous_index = index
        index = assignment.Value(routing.NextVar(index))
    return token_order, dist_total
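The distance matrix handed to OR-Tools above is simply the dense adjacency matrix of the token graph in node-insertion order, and the routing layer works with integer arc costs, which holds here since the distances are positional gaps. A small sketch of the conversion (weights hypothetical):

import networkx as nx

g = nx.DiGraph()
for (t0, t1), d in {('run', 'walk'): 1, ('run', 'see'): 4, ('walk', 'see'): 2}.items():
    g.add_edge(t0, t1, weight=d)
    g.add_edge(t1, t0, weight=d)

dist_matrix = nx.adjacency_matrix(g).todense().tolist()
# [[0, 1, 4], [1, 0, 2], [4, 2, 0]]  (rows/columns follow node insertion order)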
def _calc_token_order_distance(self, token_order_list, token_type):
    sum_total = []
    for idx in range(1, len(token_order_list)):
        v0 = self.distance_dictionary.get(token_type).get( (token_order_list[idx-1], token_order_list[idx]) )
        v1 = self.distance_dictionary.get(token_type).get( (token_order_list[idx], token_order_list[idx-1]) )
        # Distances are stored for one ordering of the pair only; use whichever lookup succeeded.
        if isinstance(v0, type(None)):
            sum_total.append( np.min(v1) )
        else:
            sum_total.append( np.min(v0) )
    return (np.sum(sum_total), sum_total)
361 """ 5. Encode the ordered tokens using a code based on indexed 362 location. Values close together will have fewer bit flips. 367 t_dict.update({token : self.
encoder.encode(idx, token_type) })
def calc_diff_matrix(self):
    X = np.zeros([len(self.tokens['verbs'])]*2)
    ...
    tup_map_f = {k: i for i, k in enumerate(self.tokens['verbs'].keys())}
    tup_map_i = {i: k for i, k in enumerate(self.tokens['verbs'].keys())}
    ...

def getPathLength(self, token_type):
    "Calculate cumulative path length of resulting basis ordering"
    ...
def load_tokens(self, corpus_path, mode=0, stop_words=True, use_spacy=False)
def _calc_token_order_distance(self, token_order_list, token_type)
def sort_basis_helper(self, token_type, num_elems)
def _create_token_graph(self, token_dist_pairs, graph_type=nx.DiGraph)
def define_basis(self, num_basis={"verbs":8, "nouns":8})
def __init__(self, corpus_path="", mode=0, stop_words=True, encoder=enc.gray.GrayEncoder(), use_spacy=False)
def _get_ordered_tokens(self, token_graph: nx.DiGraph, ham_cycle=True)
def assign_indexing(self, token_type)
def remove_stopwords(text, sw)
def tokenize_corpus(corpus, proc_mode=0, stop_words=True)
def getPathLength(self, token_type)
def calc_diff_matrix(self)
def sort_basis_tokens_by_dist(self, tokens_type, graph_type=nx.DiGraph, dist_metric=lambda x, y: np.abs(x[:, np.newaxis] - y), num_basis=16, ham_cycle=True)
def tokenize_corpus(self, corpus, proc_mode=0, stop_words=True, use_spacy=False)
def _tsp_token_solver(self, token_graph: nx.DiGraph)
def _get_token_position(self, tagged_tokens, token_type)
def sort_tokens_by_dist(self, tokens_type, graph_type=nx.DiGraph, dist_metric=lambda x, y: np.abs(x[:, np.newaxis] - y))