vsm_verbs = VectorSpaceModel.VectorSpaceModel("file.txt")
vsm_verbs.sort_tokens_by_dist("verbs")
vsm_verbs.assign_indexing("verbs")

from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

from ortools.constraint_solver import routing_enums_pb2
from ortools.constraint_solver import pywrapcp
        """ Rewrite of pc.tokenize_corpus to allow for tracking of basis word
        positions in list to improve later pairwise distance calculations. """
        if use_spacy == False:
            token_sents = self.pc.nltk.sent_tokenize(corpus)
            # ...
            for s in token_sents:
                tk = self.pc.nltk.word_tokenize(s)
                if stop_words == False:
                    # ...
                token_words.extend(tk)
                tags.extend(self.pc.nltk.pos_tag(tk))
            # ...
                if proc_mode == 's':
                    s = self.pc.nltk.SnowballStemmer('english', ignore_stopwords = not stop_words)
                    token_words = [s.stem(t) for t in token_words]
                elif proc_mode == 'l':
                    wnl = self.pc.nltk.WordNetLemmatizer()
                    token_words = [wnl.lemmatize(t) for t in token_words]
            tagged_tokens = self.pc.nltk.pos_tag(token_words)
        else:
            spacy_pos_tagger = spacy.load("en_core_web_sm")
            # ...
            for s in spacy_pos_tagger(corpus):
                if stop_words == False and s.is_stop:
                    # ...
                            raise Exception("Stemming not currently supported by spacy")
                        elif proc_mode == 'l':
                            # ...
                    text_val = text_val.lower()
                    token_words.append(text_val)
                    tags.append((text_val, s.pos_))
        count_nouns = {k : (v.size, v) for k, v in nouns.items()}
        count_verbs = {k : (v.size, v) for k, v in verbs.items()}
        return {'verbs' : count_verbs, 'nouns' : count_nouns, 'tk_sentence' : token_sents, 'tk_words' : token_words}
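For orientation, here is a minimal sketch of what the NLTK branch above produces on a toy corpus. The corpus string and variable names are illustrative only, and the example assumes the punkt and averaged_perceptron_tagger NLTK data packages are installed.

import nltk

corpus = "Dogs chase cats. Cats chase mice."
token_words, tags = [], []
for sent in nltk.sent_tokenize(corpus):
    tk = nltk.word_tokenize(sent)
    token_words.extend(tk)
    tags.extend(nltk.pos_tag(tk))

# tags now resembles [('Dogs', 'NNS'), ('chase', 'VBP'), ('cats', 'NNS'), ...];
# the noun/verb dictionaries returned above are built from pairs like these.
print(tags[:3])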
        """ Tracks the positions where a tagged element is found in
        the tokenised corpus list. Useful for comparing distances.
        If the key doesn't initially exist, it adds a list with a
        single element. Otherwise, extends the list with the new
        position. """
        for pos, token in enumerate(tagged_tokens):
            if pc.tg.matchables(token_type, token[1]):
                if isinstance(token_dict.get(token[0]), type(None)):
                    token_dict.update({token[0] : np.array([pos])})
                else:
                    token_dict.update({token[0] : np.append(token_dict.get(token[0]), pos)})
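The effect of this bookkeeping is easiest to see on a toy tagged list; the tag check below is only a stand-in for pc.tg.matchables, which performs the real POS matching.

import numpy as np

tagged_tokens = [("dogs", "NNS"), ("chase", "VBP"), ("cats", "NNS"), ("dogs", "NNS")]
token_dict = {}
for pos, (word, tag) in enumerate(tagged_tokens):
    if tag.startswith("NN"):                    # stand-in for pc.tg.matchables(token_type, tag)
        if token_dict.get(word) is None:
            token_dict[word] = np.array([pos])  # first occurrence of this token
        else:
            token_dict[word] = np.append(token_dict[word], pos)  # subsequent occurrences

print(token_dict)   # {'dogs': array([0, 3]), 'cats': array([2])}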
    Use vector space model of meaning to determine relative order of tokens in basis (see DisCo papers).

    - 1. Populate set of tokens of type t in corpus; label as T; O(n)
    - 2. Choose n top most occurring tokens from T, and define as basis elements of type t; 2 <= n <= |T|; O(n)
    - 3. Find relative (pairwise) distances between these tokens, and define as metric; n(n-1)/2 -> O(n^2)
    - 4. Sort tokens by distance metric; O(n*log(n))
    - 5. Assign tokens integer ID using Gray code mapping of sorted index; O(n)

    After the above steps the elements are readily available for encoding using the respective ID. Tokens that appear
    relatively close together will be closer in the sorted list, and as a result differ by fewer bit flips, which can be
    used in the Hamming distance calculation later for similarity of meanings.

    def __init__(self, corpus_path="", mode=0, stop_words=True, encoder=enc.gray.GrayEncoder(), use_spacy=False):
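To make steps 4 and 5 concrete, here is a minimal sketch of the standard binary-reflected Gray code and the bit-flip (Hamming) distance it is paired with; the project's actual encoder is enc.gray.GrayEncoder, so the helper names below are illustrative only.

def gray(idx):
    """Binary-reflected Gray code of a sorted-index integer."""
    return idx ^ (idx >> 1)

def hamming(a, b):
    """Number of differing bits between two integer codes."""
    return bin(a ^ b).count("1")

codes = [gray(i) for i in range(8)]
print(codes)                                              # [0, 1, 3, 2, 6, 7, 5, 4]
print([hamming(gray(i), gray(i + 1)) for i in range(7)])  # adjacent indices differ by exactly one bit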
    def load_tokens(self, corpus_path, mode=0, stop_words=True, use_spacy=False):
        """ 1. Wraps the calls from process_corpus.py to tokenize. Raises an exception if no corpus exists at the given path. """
        if path.exists(corpus_path):
            return self.pc.tokenize_corpus(pc.load_corpus(corpus_path), mode, stop_words, use_spacy)
        else:
            raise Exception("No corpus found at the given path.")
        basis_dict = {token_type : {}}
        # ...
        for counter, elem in enumerate(sorted(self.tokens[token_type].items(), key=lambda x: x[1][0], reverse=True)):
            if counter >= int(num_elems):
                break
            basis_dict[token_type].update({elem[0] : elem[1]})
        """ 2. Specify the number of basis elements in each space.
        Dict holds keys of type and values of number of elements to request. """
        # ...
        for t in ("nouns", "verbs"):
    def sort_tokens_by_dist(self, tokens_type, graph_type=nx.DiGraph, dist_metric=lambda x, y: np.abs(x[:, np.newaxis] - y)):
        tk_list = list(self.tokens[tokens_type].keys())
        # ...
        for c0, k0 in enumerate(tk_list[0:-1]):
            for k1 in tk_list[c0:]:
                # ...
                    a = self.tokens[tokens_type][k0][1]
                    b = self.tokens[tokens_type][k1][1]
                    # ...
                    dist_dict[(k0, k1)] = np.min(dist_metric(a, b))
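The default dist_metric broadcasts one token's position array against the other's, giving every pairwise occurrence distance; the np.min above then keeps the closest co-occurrence. A small worked example with toy position arrays:

import numpy as np

dist_metric = lambda x, y: np.abs(x[:, np.newaxis] - y)

a = np.array([2, 10, 25])   # corpus positions of token k0
b = np.array([4, 11])       # corpus positions of token k1

print(dist_metric(a, b))
# [[ 2  9]
#  [ 6  1]
#  [21 14]]
print(np.min(dist_metric(a, b)))   # 1 -> nearest co-occurrence of the two tokens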
        """ Maps the tokens into a fully connected digraph, where each token
        is a node, and the weighted edge between them holds their
        respective distance. In the event of multiple distances between
        two nodes, assumes the minimum of the list. """

        """ Using the token graph, a Hamiltonian path (cycle if [0]
        connected to [-1]) is found for the graph, wherein the ordering gives
        the shortest path connecting each node, visited once. This gives a
        sufficiently ordered list for the encoding values. """

    def sort_basis_tokens_by_dist(self, tokens_type, graph_type=nx.DiGraph, dist_metric=lambda x, y: np.abs(x[:, np.newaxis] - y), num_basis=16, ham_cycle=True):
        basis_tk_list = list(self.define_basis(num_basis={"verbs" : num_basis, "nouns" : num_basis})[tokens_type].keys())
        # ...
        if len(basis_tk_list) > 1:
            for c0, k0 in enumerate(basis_tk_list[0:]):
                for k1 in basis_tk_list[c0:]:
                    # ...
                        a = self.tokens[tokens_type][k0][1]
                        b = self.tokens[tokens_type][k1][1]
                        # ...
                        basis_dist_dict[(k0, k1)] = np.min(dist_metric(a, b))
        else:
            basis_dist_dict.update({(basis_tk_list[0], basis_tk_list[0]) : 0})

        """ Maps the tokens into a fully connected digraph, where each token
        is a node, and the weighted edge between them holds their
        respective distance. In the event of multiple distances between
        two nodes, assumes the minimum of the list. """

        """ Using the token graph, a Hamiltonian path (cycle if [0]
        connected to [-1]) is found for the graph, wherein the ordering gives
        the shortest path connecting each node, visited once. This gives a
        sufficiently ordered list for the encoding values. """

    def _create_token_graph(self, token_dist_pairs, graph_type=nx.DiGraph):
        """ Creates graph using the (basis) tokens as nodes, and the pairwise
        distances between them as weighted edges. Used to determine optimal
        ordering of token adjacency for later encoding. """
        assert( callable(graph_type) )
        token_graph = graph_type()
        # ...
        for tokens_tuple, distances in token_dist_pairs.items():
            # ...
            if not token_graph.has_node(tokens_tuple[0]):
                token_graph.add_node(tokens_tuple[0])
            if not token_graph.has_node(tokens_tuple[1]):
                token_graph.add_node(tokens_tuple[1])
            # ...
            if graph_type == nx.MultiGraph:
                if not isinstance(distances, int):
                    for d in distances:
                        token_graph.add_edge( tokens_tuple[0], tokens_tuple[1], weight=d )
                else:
                    token_graph.add_edge( tokens_tuple[0], tokens_tuple[1], weight=distances )
            else:
                d_val = np.min(distances) if not isinstance(distances, int) else distances
                token_graph.add_edge( tokens_tuple[0], tokens_tuple[1], weight=d_val )
                if graph_type == nx.DiGraph:
                    token_graph.add_edge( tokens_tuple[1], tokens_tuple[0], weight=d_val )
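A toy illustration of the structure this builds, assuming a plain nx.DiGraph and distances already reduced to their minimum values (the dictionary below is made up for the example):

import networkx as nx

token_dist_pairs = {("dog", "cat"): 1, ("dog", "run"): 4, ("cat", "run"): 2}

g = nx.DiGraph()
for (u, v), d in token_dist_pairs.items():
    g.add_edge(u, v, weight=d)   # forward edge
    g.add_edge(v, u, weight=d)   # mirrored edge, as in the DiGraph branch above

print(g["dog"]["cat"]["weight"])          # 1
print(nx.adjacency_matrix(g).todense())   # symmetric weighted adjacency matrix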
        """ Solves the Hamiltonian cycle problem to define the ordering of the basis tokens.
        If a cycle is not required, can solve the TSP problem instead (ham_cycle = False). """
        assert( isinstance(token_graph, nx.DiGraph) )
        assert( nx.tournament.is_strongly_connected(token_graph) )
        if ham_cycle:
            return nx.tournament.hamiltonian_path(token_graph)
        else:
            # ...
            return token_order[:]
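For reference, the networkx call used above operates on tournaments (digraphs with exactly one directed edge per node pair); a minimal sketch on a toy tournament, with made-up node names:

import networkx as nx
from networkx.algorithms import tournament

# One directed edge between every pair of nodes.
g = nx.DiGraph([("dog", "cat"), ("cat", "run"), ("dog", "run"),
                ("run", "walk"), ("dog", "walk"), ("cat", "walk")])

print(tournament.is_tournament(g))       # True
print(tournament.hamiltonian_path(g))    # e.g. ['dog', 'cat', 'run', 'walk']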
        """ Using or-tools to solve TSP of token_graph.
        Adapted from or-tools examples on TSP. """
        dist_dat = {}
        dist_dat['distance_matrix'] = nx.adjacency_matrix(token_graph).todense().tolist()
        dist_dat['num_vehicles'] = 1
        dist_dat['depot'] = 0

        mgr = pywrapcp.RoutingIndexManager(
                    len(dist_dat['distance_matrix']),
                    dist_dat['num_vehicles'], dist_dat['depot']
        )
        routing = pywrapcp.RoutingModel(mgr)

        def dist_matrix_val(idx_0, idx_1):
            """Returns value from distance matrix between nodes"""
            n0 = mgr.IndexToNode(idx_0)
            n1 = mgr.IndexToNode(idx_1)
            return dist_dat['distance_matrix'][n0][n1]

        transit_callback_index = routing.RegisterTransitCallback(dist_matrix_val)
        routing.SetArcCostEvaluatorOfAllVehicles(transit_callback_index)

        search_parameters = pywrapcp.DefaultRoutingSearchParameters()
        search_parameters.first_solution_strategy = (
            routing_enums_pb2.FirstSolutionStrategy.PATH_CHEAPEST_ARC)

        assignment = routing.SolveWithParameters(search_parameters)
        dist_total = assignment.ObjectiveValue()

        index = routing.Start(0)
        token_order = []
        token_list = list(token_graph)
        while not routing.IsEnd(index):
            token_order.append(token_list[mgr.IndexToNode(index)])
            previous_index = index
            index = assignment.Value(routing.NextVar(index))
        return token_order, dist_total
        sum_total = []
        for idx in range(1, len(token_order_list)):
            v0 = self.distance_dictionary.get(token_type).get((token_order_list[idx-1], token_order_list[idx]))
            v1 = self.distance_dictionary.get(token_type).get((token_order_list[idx], token_order_list[idx-1]))
            # the pair may be keyed in either orientation; use whichever lookup succeeded
            if v0 is None:
                sum_total.append(np.min(v1))
            else:
                sum_total.append(np.min(v0))
        return (np.sum(sum_total), sum_total)
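A tiny numeric illustration of this cumulative ordering cost, with a made-up distance dictionary keyed by adjacent token pairs:

import numpy as np

distances = {("dog", "cat"): 1, ("cat", "run"): 2, ("run", "walk"): 4}
order = ["dog", "cat", "run", "walk"]

steps = [distances[(order[i - 1], order[i])] for i in range(1, len(order))]
print(steps, np.sum(steps))   # [1, 2, 4] 7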
        """ 5. Encode the ordered tokens using a code based on indexed
        location. Values close together will have fewer bit flips. """
        # ...
            t_dict.update({token : self.encoder.encode(idx, token_type)})
        X = np.zeros([len(self.tokens['verbs'])]*2)
        # ...
        tup_map_f = {k : i for i, k in enumerate(self.tokens['verbs'].keys())}
        tup_map_i = {i : k for i, k in enumerate(self.tokens['verbs'].keys())}

    def getPathLength(self, token_type):
        "Calculate cumulative path length of resulting basis ordering"
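The calc_diff_matrix excerpt above only shows the scaffolding (an NxN zero matrix plus token/index lookup maps); assuming the intent is a pairwise bit-difference matrix over the encoded verb tokens, a hedged sketch might look like this, with hypothetical encodings standing in for the assign_indexing output:

import numpy as np

encodings = {"chase": 0b000, "run": 0b001, "walk": 0b011, "sleep": 0b010}   # hypothetical Gray-coded IDs

tup_map_f = {k: i for i, k in enumerate(encodings)}   # token -> matrix index
tup_map_i = {i: k for i, k in enumerate(encodings)}   # matrix index -> token

X = np.zeros([len(encodings)] * 2, dtype=int)
for a, i in tup_map_f.items():
    for b, j in tup_map_f.items():
        X[i, j] = bin(encodings[a] ^ encodings[b]).count("1")   # Hamming distance between codes

print(tup_map_i[0], tup_map_i[1], X[0, 1])   # chase run 1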
def load_tokens(self, corpus_path, mode=0, stop_words=True, use_spacy=False)
 
def _calc_token_order_distance(self, token_order_list, token_type)
 
def sort_basis_helper(self, token_type, num_elems)
 
def _create_token_graph(self, token_dist_pairs, graph_type=nx.DiGraph)
 
def define_basis(self, num_basis={"verbs":8, "nouns":8})
 
def __init__(self, corpus_path="", mode=0, stop_words=True, encoder=enc.gray.GrayEncoder(), use_spacy=False)
 
def _get_ordered_tokens(self, token_graph: nx.DiGraph, ham_cycle=True)
 
def assign_indexing(self, token_type)
 
def remove_stopwords(text, sw)
 
def tokenize_corpus(corpus, proc_mode=0, stop_words=True)
 
def getPathLength(self, token_type)
 
def calc_diff_matrix(self)
 
def sort_basis_tokens_by_dist(self, tokens_type, graph_type=nx.DiGraph, dist_metric=lambda x, y: np.abs(x[:, np.newaxis] - y), num_basis=16, ham_cycle=True)
 
def tokenize_corpus(self, corpus, proc_mode=0, stop_words=True, use_spacy=False)
 
def _tsp_token_solver(self, token_graph: nx.DiGraph)
 
def _get_token_position(self, tagged_tokens, token_type)
 
def sort_tokens_by_dist(self, tokens_type, graph_type=nx.DiGraph, dist_metric=lambda x, y: np.abs(x[:, np.newaxis] - y))