QNLP  v1.0
QNLP_Overlap.py
#!/usr/bin/env python
# coding: utf-8

# # Calculating sentence similarity using a hybrid classical-quantum workflow
#
# ### Lee J. O'Riordan and Myles Doyle
#

# This notebook demonstrates a mixed hybrid workflow to examine relationships
# between words, and the respective sentence meanings, using classical and
# quantum information processing techniques.
#
# Our goal is to analyse a corpus, extract key features from it, and represent
# those features in a quantum-compatible notation. We proceed to encode these
# features into a quantum simulation environment using Intel-QS (formerly qHiPSTER),
# and query for similarities using the encoded quantum states. For this, we have
# implemented various encoding and analysis strategies. Our primary method for
# similarity uses a Hamming distance approach, wherein we apply a simplified
# digital-to-analogue encoding strategy to allow the results to be obtained via
# measurement.
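
# As a purely classical reference point (a minimal sketch, not part of the
# quantum workflow below; the helper name is ours), the Hamming distance
# between two bit-encoded patterns simply counts the bits in which they differ:
def _hamming_distance_classical(a, b):
    """Number of differing bits between the integer bit-patterns a and b."""
    return bin(a ^ b).count("1")

# e.g. _hamming_distance_classical(0b1011, 0b1101) == 2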

# We must load the Python QNLP packages & module (`QNLP`), and the C++-Python
# bound simulator backend and algorithms (`PyQNLPSimulator`).

from mpi4py import MPI
from PyQNLPSimulator import PyQNLPSimulator as p
import QNLP as q
import numpy as np
from QNLP import DisCoCat
from QNLP.encoding import simple
from QNLP.proc.VerbGraph import VerbGraph as vg
from QNLP.proc.HammingDistance import HammingDistance
import networkx as nx
from tqdm import tqdm
import sys
import os

from itertools import product
import tempfile
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
try:
    NUM_BASIS_NOUN = int(os.environ['NUM_BASIS_NOUN'])
    NUM_BASIS_VERB = int(os.environ['NUM_BASIS_VERB'])

    BASIS_NOUN_DIST_CUTOFF = int(os.environ['BASIS_NOUN_DIST_CUTOFF'])
    BASIS_VERB_DIST_CUTOFF = int(os.environ['BASIS_VERB_DIST_CUTOFF'])

    VERB_NOUN_DIST_CUTOFF = int(os.environ['VERB_NOUN_DIST_CUTOFF'])

except KeyError as e:
    NUM_BASIS_NOUN = 2
    NUM_BASIS_VERB = 2
    BASIS_NOUN_DIST_CUTOFF = 2
    BASIS_VERB_DIST_CUTOFF = 2
    VERB_NOUN_DIST_CUTOFF = 2

# Next, we load the corpus file using the vector-space model, defined in the
# `VectorSpaceModel` class, specifying the mode of tagging, and whether to
# filter out stop-words. For this notebook I have used the Project Gutenberg
# [https://www.gutenberg.org/] edition of `Alice in Wonderland`, with simple
# replacements to avoid incorrect tagging of elements (mostly standardising
# apostrophes to \' and quotations to \").

if rank == 0:
    s_encoder = simple.SimpleEncoder(num_nouns=NUM_BASIS_NOUN, num_verbs=NUM_BASIS_VERB)
    assert len(sys.argv) > 1
    corpus_file = sys.argv[1]
    if not os.path.isfile(corpus_file):
        print("Error: input corpus file does not exist")
        sys.exit()
    vsm = q.VectorSpaceModel.VectorSpaceModel(
        corpus_path=corpus_file,
        mode="l",
        stop_words=True,
        encoder=s_encoder,
        use_spacy=True
    )

# From here we can specify the number of basis elements by occurrence in the
# corpus. This takes the requested number of most frequently occurring tokens
# in the verb and noun spaces respectively.
    basis = vsm.define_basis({'verbs' : NUM_BASIS_VERB, 'nouns' : NUM_BASIS_NOUN})

# Next, we aim to sort the mapping of the basis tokens to a binary
# representation for state encoding. By building a graph and aiming to solve
# the Hamiltonian cycle problem, we can ensure that the tokens are ordered such
# that nearest neighbours have the shortest paths between them. This becomes
# important later when encoding information: an incorrect bit-flip (potentially
# introduced by noise) should still allow some similarity to be represented. By
# ordering the tokens by their minimum path lengths we maintain closeness in
# the presence of errors.

    verb_dist = vsm.sort_basis_tokens_by_dist("verbs", num_basis=NUM_BASIS_VERB)
    noun_dist = vsm.sort_basis_tokens_by_dist("nouns", num_basis=NUM_BASIS_NOUN)

# We now take the previously defined basis elements and attempt to arrange
# them such that the distance between nearest neighbours in the corpus is
# minimised. With this, we assign a Gray-code value to each basis token. This
# ensures that values closer together in the corpus have a shorter Hamming
# distance between them. We make use of the DisCo-inspired compositional model,
# wherein the distances between words dictate their closeness; we use these
# distances to define the edge weights between nodes (tokens), and hence, by
# posing the problem as a TSP, we can find an ordering that ensures Hamming
# distance relates directly to closeness of words.
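# A minimal, self-contained sketch of why Gray codes are used here (the values
# below are hypothetical and independent of the corpus): consecutive Gray codes
# differ in exactly one bit, so neighbouring tokens in the ordering end up one
# bit-flip, i.e. Hamming distance 1, apart.
    _example_gray = [i ^ (i >> 1) for i in range(8)]   # [0, 1, 3, 2, 6, 7, 5, 4]
    _example_adjacent_dists = [bin(a ^ b).count("1") for a, b in zip(_example_gray, _example_gray[1:])]
    # every entry of _example_adjacent_dists is 1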

    vsm.assign_indexing("nouns")
    vsm.assign_indexing("verbs")

# With the basis tokens correctly ordered, we may now map the other respective
# elements in their space onto these bases. We aim to map the nouns onto the
# noun basis, and likewise for the verbs. This allows us to represent any other
# word in the corpus in the given basis. We may encode arbitrary superposition
# states to represent the tokens, which can be regarded as a vector-space
# representation model using quantum states.
#
# For this, we begin by creating a `DisCoCat` object, and use it to perform the
# mappings of tokens onto basis tokens.

    dcc = DisCoCat.DisCoCat()
    mapping_verbs = dcc.map_to_basis(vsm.tokens['verbs'], verb_dist['verbs'], basis_dist_cutoff=BASIS_VERB_DIST_CUTOFF)
    mapping_nouns = dcc.map_to_basis(vsm.tokens['nouns'], noun_dist['nouns'], basis_dist_cutoff=BASIS_NOUN_DIST_CUTOFF)

# For the above data, the meanings of the composite nouns `hall` and `table` can be represented as:
#
# $$\begin{array}{ll}
# \vert \textrm{hall} \rangle &= a_0\vert \textrm{round} \rangle + a_1\vert \textrm{Rabbit} \rangle + a_2\vert \textrm{head} \rangle + a_3\vert \textrm{way} \rangle + a_4\vert \textrm{time} \rangle + a_5\vert \textrm{Alice} \rangle, \\
# \vert \textrm{table} \rangle &= b_{0}\vert \textrm{March} \rangle + b_{1}\vert \textrm{tone} \rangle + b_{2}\vert \textrm{round} \rangle + b_{3}\vert \textrm{nothing} \rangle + b_{4}\vert \textrm{Hare} \rangle + b_{5}\vert \textrm{things} \rangle + b_{6}\vert \textrm{thing} \rangle + b_{7}\vert \textrm{way} \rangle + b_{8}\vert \textrm{King} \rangle + b_{9}\vert \textrm{time} \rangle + b_{10}\vert \textrm{Alice} \rangle,
# \end{array}
# $$
#
# where we assume each item in the basis set has an orthogonal quantum state
# representing it. For the given 32-element bases, the coefficient of any
# unlisted state is taken to be zero.
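# As an illustrative sketch only (hypothetical amplitudes, not values derived
# from the corpus): such a composite token corresponds to a normalised
# amplitude vector over its basis tokens, e.g. for the state of `hall` above:
    _example_amps = np.array([3.0, 2.0, 2.0, 1.0, 1.0, 1.0])        # a_0 ... a_5 (unnormalised weights)
    _example_amps = _example_amps / np.linalg.norm(_example_amps)    # normalised so <hall|hall> = 1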

# With nouns mappable to the noun basis set, and verbs mappable to the verb
# basis set, we may now examine the composite noun-verb-noun (NVN) sentence
# structures. This involves a similar approach to the compositional mapping of
# token to token-basis, wherein composite words that appear close together can
# be considered part of the same NVN sentence, and thus present themselves as a
# constructable entity for binary-string quantum state encoding.
#
# The following pair-wise relations are required for a fully generalised solution:
#
# $$
# \begin{equation*}
# \begin{array}{l|c|c|c}
# & \textbf{Dataset 1} & \textbf{Dataset 2} & \textbf{Use} \\
# \hline
# 1.&\textrm{noun basis} & \textrm{noun basis} & \textrm{noun basis binary ordering} \\
# 2.&\textrm{noun composite} & \textrm{noun basis} & \textrm{noun representative meaning} \\
# 3.&\textrm{noun composite} & \textrm{noun composite} & \textrm{inter-noun relationships} \\
# \hline
# 4.&\textrm{verb basis} & \textrm{verb basis} & \textrm{verb basis binary ordering} \\
# 5.&\textrm{verb composite} & \textrm{verb basis} & \textrm{verb representative meaning} \\
# 6.&\textrm{verb composite} & \textrm{verb composite} & \textrm{inter-verb relationships} \\
# \hline
# 7.&\textrm{noun basis} & \textrm{verb basis} & \textrm{compiled bit-strings for encoding} \\
# 8.&\textrm{noun composite} & \textrm{verb composite} & \textrm{compositional meaning for bit-string generation} \\
# \end{array}
# \end{equation*}
# $$

# Additionally, one may explore a more complex parameter space by customising
# the tagging options; using lemmatisation `(mode="l")`, stemming `(mode="s")`,
# or the default tagging options `(mode=None)` results in a different set of
# words in the above mappings.

# From here, we define our encoding and decoding dictionaries.

# Define basis tokens encoding and decoding dicts
    encoding_dict = {"ns" : vsm.encoded_tokens["nouns"],
                     "v"  : vsm.encoded_tokens["verbs"],
                     "no" : vsm.encoded_tokens["nouns"]
                     }

    decoding_dict = {"ns" : { v:k for k,v in encoding_dict["ns"].items() },
                     "v"  : { v:k for k,v in encoding_dict["v"].items() },
                     "no" : { v:k for k,v in encoding_dict["no"].items() }
                     }
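# A quick illustration of how these maps are used (a sketch; assumes the noun
# map is non-empty, and the throwaway names are ours): `encoding_dict` maps a
# token to its bit-pattern, and `decoding_dict` maps the bit-pattern back.
    if encoding_dict["ns"]:
        _tok, _code = next(iter(encoding_dict["ns"].items()))
        _recovered = decoding_dict["ns"][_code]   # token associated with that bit-pattern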

# With the above information, we can now determine the required resources to
# store our data in a qubit register.

# Register must be large enough to support 2*|nouns| + |verbs| in the given encoding
    len_reg_memory = q.encoding.utils.pow2bits( int(np.max( list(encoding_dict['v'].values()))) )[1] + \
                     q.encoding.utils.pow2bits( int(np.max( list(encoding_dict['no'].values()))) )[1] + \
                     q.encoding.utils.pow2bits( int(np.max( list(encoding_dict['ns'].values()))) )[1]

    len_reg_aux = len_reg_memory + 2
    num_qubits = len_reg_memory + len_reg_aux
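# For intuition (a sketch with hypothetical sizes, not the values computed
# above): indexing N basis states needs ceil(log2(N)) bits, and the memory
# register holds two noun fields (subject and object) plus one verb field.
    def _bits_needed(n):
        # minimal number of bits required to index n distinct basis states
        return int(np.ceil(np.log2(n)))
    _example_len_memory = 2 * _bits_needed(8) + _bits_needed(4)   # e.g. 8 noun / 4 verb basis states -> 8 bits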

    print("""{}
Requires {} qubits to encode data using {}
noun and {} verb basis elements, allowing a
maximum of {} unique patterns.
{}
""".format("#"*48, num_qubits, NUM_BASIS_NOUN, NUM_BASIS_VERB, (NUM_BASIS_NOUN**2)*NUM_BASIS_VERB, "#"*48)
    )

# Using the encoded bit-strings for the bases, we look up the mapping terms for
# composite nouns and verbs, create the composite bit-strings, and generate the
# quantum states.
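# To illustrate the idea only (a sketch with hypothetical field widths; the
# workflow below delegates the real packing to q.utils.encode_binary_pattern_direct):
# an NVN pattern can be formed by concatenating the subject-noun, verb and
# object-noun bit-fields into a single bit-string.
    def _pack_nvn(ns_code, v_code, no_code, v_bits, no_bits):
        # place the subject noun in the highest bits, the object noun in the lowest
        return (ns_code << (v_bits + no_bits)) | (v_code << no_bits) | no_code
    _example_nvn = _pack_nvn(0b10, 0b1, 0b01, v_bits=1, no_bits=2)   # -> 0b10101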

    corpus_list_n = vsm.tokens['nouns']
    corpus_list_v = vsm.tokens['verbs']
    dist_cutoff = BASIS_VERB_DIST_CUTOFF

    v_list = vg.calc_verb_noun_pairings(corpus_list_v, corpus_list_n, VERB_NOUN_DIST_CUTOFF)

    sentences = []

    for v in v_list:
        for i in v.lr_nouns.keys(): #product(v.left_nouns, [v.verb], v.right_nouns):
            #ns,s,no = mapping_nouns[i[0]], mapping_verbs[i[1]], mapping_nouns[i[2]]
            #if v.left_nouns != None and v.right_nouns != None:
            if mapping_nouns[i[0]] is not None and mapping_verbs[v.verb] is not None and mapping_nouns[i[1]] is not None:
                sentences.append(
                    [ {i[0]   : [encoding_dict['ns'][k] for k in mapping_nouns[i[0]].keys()] },
                      {v.verb : [encoding_dict['v'][k] for k in mapping_verbs[v.verb].keys()] },
                      {i[1]   : [encoding_dict['no'][k] for k in mapping_nouns[i[1]].keys()] }
                    ]
                )
    print("Sentences matching the noun-verb-noun structure captured as:", sentences)


    # Set up registers to store indices.
    # Keeping the aux register and control registers in these positions
    # reduces overhead during the encoding stages: ~25% faster than data-aux-control.
    reg_memory = [0]*len_reg_memory
    for i in range(len_reg_memory):
        reg_memory[i] = i + len_reg_aux

    reg_aux = [0]*len_reg_aux
    for i in range(len_reg_aux-2):
        reg_aux[i] = i + 2
    reg_aux[-2] = 0
    reg_aux[-1] = 1
    print("REG_MEM=", reg_memory)
    print("REG_AUX=", reg_aux)
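    # For example (hypothetical size): with len_reg_memory = 3 the loops above give
    # reg_aux = [2, 3, 4, 0, 1] and reg_memory = [5, 6, 7], i.e. the two control/aux
    # qubits sit at indices 0 and 1, the remaining auxiliary qubits follow, and the
    # data register occupies the highest indices.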

    # Create a list for the patterns to be encoded
    vec_to_encode = []

    # Generate bit-patterns from the sentences and store them in vec_to_encode
    for idx, sentence in enumerate(sentences):
        superpos_patterns = list( product( list(sentence[0].values())[0], list(sentence[1].values())[0], list(sentence[2].values())[0] ) )
        # Generate all combinations of the bit-patterns for the superposition states
        for patt in superpos_patterns:
            num = q.utils.encode_binary_pattern_direct(patt, encoding_dict)
            vec_to_encode.append(num)

    # Need to remove duplicates
    vec_to_encode = list(set(vec_to_encode))
    vec_to_encode.sort()

    d = {"sentences" : len(sentences),
         "patterns" : len(vec_to_encode),
         "NUM_BASIS_NOUN" : NUM_BASIS_NOUN,
         "NUM_BASIS_VERB" : NUM_BASIS_VERB,
         "BASIS_NOUN_DIST_CUTOFF" : BASIS_NOUN_DIST_CUTOFF,
         "BASIS_VERB_DIST_CUTOFF" : BASIS_VERB_DIST_CUTOFF,
         "VERB_NOUN_DIST_CUTOFF" : VERB_NOUN_DIST_CUTOFF
         }
    print(d)
    print("Encoding data:", vec_to_encode)

    # Counter for experiment results; a key exists for each potentially unique outcome defined by the encoding
    shot_counter = {}

    for i in vec_to_encode:
        shot_counter.update({i : 0})

    # from IPython import embed; embed()

else:
    reg_memory = None
    reg_aux = None
    len_reg_memory = None
    vec_to_encode = None
    shot_counter = None
    encoding_dict = None
    bit_shifts = None

reg_memory = comm.bcast(reg_memory, root=0)
reg_aux = comm.bcast(reg_aux, root=0)
vec_to_encode = comm.bcast(vec_to_encode, root=0)
shot_counter = comm.bcast(shot_counter, root=0)
encoding_dict = comm.bcast(encoding_dict, root=0)

num_qubits = len(reg_memory) + len(reg_aux)

# Explicitly disable fusion as it can cause incorrect results
use_fusion = False

if os.environ.get('RESOURCE_EST') is not None:
    print("Overriding default qubit count")
    num_qubits = 1

sim1 = p(num_qubits, use_fusion)
sim2 = p(num_qubits, use_fusion)
normalise = True

if rank == 0:
    try:
        num_exps = int(sys.argv[2])
    except Exception as e:
        num_exps = 10
else:
    num_exps = None

num_exps = comm.bcast(num_exps, root=0)

num_faults = 0

# Use all potential string patterns as the test set
test_strings = list(product(encoding_dict["ns"].values(), encoding_dict["v"].values(), encoding_dict["no"].values()))

for i_ts1, ts1 in enumerate(test_strings[:-1]):
    test_pattern1 = q.utils.encode_binary_pattern_direct(ts1, encoding_dict)
    if test_pattern1 in vec_to_encode:
        if rank == 0:
            print("Pattern {} already exists. Skipping".format(test_pattern1))
            sys.stdout.flush()
        continue

    for i_ts2, ts2 in enumerate(test_strings[i_ts1+1:]):
        test_pattern2 = q.utils.encode_binary_pattern_direct(ts2, encoding_dict)

        if test_pattern2 in vec_to_encode:
            if rank == 0:
                print("Pattern {} already exists. Skipping encoding of {} and {}".format(test_pattern2, test_pattern1, test_pattern2))
                sys.stdout.flush()
            continue

        sim1.initRegister()
        sim2.initRegister()

        # Encode
        sim1.encodeBinToSuperpos_unique(reg_memory, reg_aux, vec_to_encode, len(reg_memory))
        sim2.encodeBinToSuperpos_unique(reg_memory, reg_aux, vec_to_encode, len(reg_memory))

        # Compute Hamming distance between test pattern and encoded patterns
        sim1.applyHammingDistanceRotY(test_pattern1, reg_memory, reg_aux, len(reg_memory))
        sim2.applyHammingDistanceRotY(test_pattern2, reg_memory, reg_aux, len(reg_memory))
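        # Descriptive note (our paraphrase of the step above, not an original comment):
        # the RotY stage rotates the marked auxiliary qubit by an angle related to each
        # encoded pattern's Hamming distance from the test pattern; collapsing that qubit
        # below post-selects a distance-weighted superposition, and the overlap of the
        # two post-selected states is then used as the similarity score.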

        # Measure
        sim1.collapseToBasisZ(reg_aux[len(reg_aux)-2], 1)
        sim2.collapseToBasisZ(reg_aux[len(reg_aux)-2], 1)

        val = sim1.overlap(sim2)

        if rank == 0:
            print("|<{}|{}>|^2,|<{}|{}>|^2,{}".format(test_pattern1, test_pattern2, ts1, ts2, np.abs(val)**2))
            sys.stdout.flush()

MPI.Finalize()
exit()