QNLP  v1.0
QNLP_EndToEnd_MPI.py
#!/usr/bin/env python
# coding: utf-8

# # Calculating sentence similarity using a hybrid classical-quantum workflow
#
# ### Lee J. O'Riordan and Myles Doyle
#

# This notebook will serve to demonstrate a mixed hybrid workflow to examine
# relationships between words, and respective sentence meanings using classical
# and quantum information processing techniques.
#
# Our goal is to analyse a corpus, extract key features from it, and represent
# those features in a quantum-compatible notation. We proceed to encode these
# features into a quantum simulation environment using Intel-QS (formerly qHiPSTER),
# and query for similarities using the encoded quantum states. For this, we have
# implemented various encoding and analysis strategies. Our primary method for
# similarity uses a Hamming distance approach, wherein we apply a simplified
# digital-to-analogue encoding strategy to allow the results to be obtained via
# measurement.

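# As a brief illustration of the underlying metric (a standalone sketch, not part
# of the workflow below): the Hamming distance between two encoded bit-patterns is
# simply the number of differing bits, obtainable classically via XOR and a popcount.
_example_hamming_distance = bin(0b0110 ^ 0b0011).count("1")  # == 2 differing bits
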
# We must load the Python QNLP packages & module (`QNLP`), and the C++-Python
# bound simulator backend and algorithms (`PyQNLPSimulator`).

from mpi4py import MPI
from PyQNLPSimulator import PyQNLPSimulator as p
import QNLP as q
import numpy as np
from QNLP import DisCoCat
from QNLP.encoding import simple
from QNLP.proc.VerbGraph import VerbGraph as vg
from QNLP.proc.HammingDistance import HammingDistance
import networkx as nx
from tqdm import tqdm
import sys
import os

from itertools import product
import tempfile

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
try:
    NUM_BASIS_NOUN = int(os.environ['NUM_BASIS_NOUN'])
    NUM_BASIS_VERB = int(os.environ['NUM_BASIS_VERB'])

    BASIS_NOUN_DIST_CUTOFF = int(os.environ['BASIS_NOUN_DIST_CUTOFF'])
    BASIS_VERB_DIST_CUTOFF = int(os.environ['BASIS_VERB_DIST_CUTOFF'])

    VERB_NOUN_DIST_CUTOFF = int(os.environ['VERB_NOUN_DIST_CUTOFF'])

except KeyError as e:

    NUM_BASIS_NOUN = 2
    NUM_BASIS_VERB = 2
    BASIS_NOUN_DIST_CUTOFF = 2
    BASIS_VERB_DIST_CUTOFF = 2
    VERB_NOUN_DIST_CUTOFF = 2

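# The five parameters above may be supplied through the environment; otherwise the
# defaults are used. A typical invocation (illustrative only; adjust the process
# count and paths to your own setup) might look like:
#
#   NUM_BASIS_NOUN=8 NUM_BASIS_VERB=4 mpirun -n 4 python QNLP_EndToEnd_MPI.py corpus.txt 500
#
# where the first positional argument is the corpus file and the optional second
# argument is the number of experiments (shots), defaulting to 10 further below.
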
# Next, we load the corpus file using the vector-space model, defined in the
# `VectorSpaceModel` class, specifying the mode of tagging, and whether to
# filter out stop-words. For this notebook I have tested with the Project
# Gutenberg [https://www.gutenberg.org/] edition of `Alice in Wonderland`,
# with simple replacements to avoid incorrect tagging of elements (mostly
# standardising apostrophes to \' and quotations to \").

if rank == 0:
    s_encoder = simple.SimpleEncoder(num_nouns=NUM_BASIS_NOUN, num_verbs=NUM_BASIS_VERB)
    assert (len(sys.argv) > 1)

    corpus_file = sys.argv[1]

    if not os.path.isfile(corpus_file):
        print("Error: input corpus file does not exist")
        MPI.Finalize()
        sys.exit()

    vsm = q.VectorSpaceModel.VectorSpaceModel(
        corpus_path=corpus_file,
        mode="l",
        stop_words=True,
        encoder=s_encoder,
        use_spacy=True
    )

# From here we can specify the number of basis elements by occurrence in the
# corpus. This will take the `num_basis_elems` most frequently occurring tokens
# in both verb and noun spaces respectively.
    basis = vsm.define_basis({'verbs' : NUM_BASIS_VERB, 'nouns' : NUM_BASIS_NOUN})

# Next, we aim to sort the mapping of the basis tokens to a binary
# representation for state encoding. By building a graph and aiming to solve
# the Hamiltonian cycle problem, we can ensure that the tokens are ordered such
# that any nearest neighbours have the shortest paths. This becomes necessary
# later when encoding information, as any incorrect bit-flips (potentially
# introduced by noise) should still allow some similarity to be represented. By
# ordering the tokens by their minimum path lengths we may maintain closeness
# in the presence of errors.

    verb_dist = vsm.sort_basis_tokens_by_dist("verbs", num_basis=NUM_BASIS_VERB)
    noun_dist = vsm.sort_basis_tokens_by_dist("nouns", num_basis=NUM_BASIS_NOUN)

# We now take the previously defined basis elements, and attempt to arrange
# them such that the distance between nearest-neighbours in the corpus is
# minimised. With this, we assign a Gray code value to the basis. This allows
# us to ensure values closer together in the corpus have a shorter Hamming
# distance between them. We make use of the DisCo-inspired compositional model,
# wherein the distances between words dictate their closeness; we use these
# distances to define the edge-weights between nodes (tokens), and hence, by
# defining the problem as a TSP, we can find an ordering that ensures Hamming
# distance relates directly to closeness of words.

    vsm.assign_indexing("nouns")
    vsm.assign_indexing("verbs")

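    # To illustrate the idea (a standalone sketch, not used by the workflow below):
    # the binary-reflected Gray code maps index i -> i ^ (i >> 1), so tokens adjacent
    # in the optimised ordering receive codes differing in exactly one bit.
    _gray_example = [i ^ (i >> 1) for i in range(4)]  # [0b00, 0b01, 0b11, 0b10]
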
# With the basis tokens correctly ordered, we may now map the other respective
# elements in their space onto these bases. We aim to map the nouns onto the
# noun basis, and likewise for the verbs. This allows us to represent any other
# word in the corpus in the given basis. We may encode arbitrary superposition
# states to represent the tokens, which can be ascribed as a vector-space
# representation model using quantum states.
#
# For this, we begin by creating a `DisCoCat` object, and use it to perform the
# mappings of basis tokens.

    dcc = DisCoCat.DisCoCat()
    mapping_verbs = dcc.map_to_basis(vsm.tokens['verbs'], verb_dist['verbs'], basis_dist_cutoff=BASIS_VERB_DIST_CUTOFF)
    mapping_nouns = dcc.map_to_basis(vsm.tokens['nouns'], noun_dist['nouns'], basis_dist_cutoff=BASIS_NOUN_DIST_CUTOFF)

# For the above data, the meanings of the composite nouns `hall` and `table` can be represented as:
#
# $$\begin{array}{ll}
# \vert \textrm{hall} \rangle &= a_0\vert \textrm{round} \rangle + a_1\vert \textrm{Rabbit} \rangle + a_2\vert \textrm{head} \rangle + a_3\vert \textrm{way} \rangle + a_4\vert \textrm{time} \rangle + a_5\vert \textrm{Alice} \rangle, \\
# \vert \textrm{table} \rangle &= b_{0}\vert \textrm{March} \rangle + b_{1}\vert \textrm{tone} \rangle + b_{2}\vert \textrm{round} \rangle + b_{3}\vert \textrm{nothing} \rangle + b_{4}\vert \textrm{Hare} \rangle + b_{5}\vert \textrm{things} \rangle + b_{6}\vert \textrm{thing} \rangle + b_{7}\vert \textrm{way} \rangle + b_{8}\vert \textrm{King} \rangle + b_{9}\vert \textrm{time} \rangle + b_{10}\vert \textrm{Alice} \rangle,
# \end{array}
# $$
#
# where we assume each item in the basis set has an orthogonal quantum state
# representing it. For the given 32-numbered bases, we can assume the
# coefficient of any unlisted state is represented as zero.

# With nouns mappable to the noun basis set, and verbs mappable to the verb
# basis set, we may now examine the composite noun-verb-noun sentence
# structures. This involves a similar approach to compositional mapping of
# token to token-basis, wherein composite words close together can be
# considered within the same NVN sentence, and thus present themselves as a
# constructable entity for binary-string quantum state encoding.
#
# The following pair-wise relations are required for a fully generalised solution:
#
# $$
# \begin{equation*}
# \begin{array}{l|c|c|c}
# & \textbf{Dataset 1} & \textbf{Dataset 2} & \textbf{Use} \\
# \hline
# 1.&\textrm{noun basis} & \textrm{noun basis} & \textrm{noun basis binary ordering} \\
# 2.&\textrm{noun composite} & \textrm{noun basis} & \textrm{noun representative meaning} \\
# 3.&\textrm{noun composite} & \textrm{noun composite} & \textrm{inter-noun relationships} \\
# \hline
# 4.&\textrm{verb basis} & \textrm{verb basis} & \textrm{verb basis binary ordering} \\
# 5.&\textrm{verb composite} & \textrm{verb basis} & \textrm{verb representative meaning} \\
# 6.&\textrm{verb composite} & \textrm{verb composite} & \textrm{inter-verb relationships} \\
# \hline
# 7.&\textrm{noun basis} & \textrm{verb basis} & \textrm{compiled bit-strings for encoding} \\
# 8.&\textrm{noun composite} & \textrm{verb composite} & \textrm{compositional meaning for bit-string generation} \\
# \end{array}
# \end{equation*}
# $$

# Additionally, one may add a more complex parameter exploration space by
# customising the tagging options; here the use of lemmatisation `(mode="l")`,
# stemming `(mode="s")`, or default tagging options `(mode=None)`, results in
# a different set of words in the above graphs.

# From here, we define our encoding and decoding dictionaries.

# Define basis tokens encoding and decoding dicts
    encoding_dict = {"ns" : vsm.encoded_tokens["nouns"],
                     "v"  : vsm.encoded_tokens["verbs"],
                     "no" : vsm.encoded_tokens["nouns"]
                     }

    decoding_dict = {"ns" : { v:k for k,v in encoding_dict["ns"].items() },
                     "v"  : { v:k for k,v in encoding_dict["v"].items() },
                     "no" : { v:k for k,v in encoding_dict["no"].items() }
                     }

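    # For intuition (hypothetical values, not taken from the corpus): encoding_dict
    # maps each token in a given slot ("ns" and "no" appearing to denote the subject-
    # and object-noun slots, with "v" the verb) to its integer bit-pattern, and
    # decoding_dict holds the inverse mapping, e.g.
    #   encoding_dict["ns"]  ~  {"alice": 0, "queen": 1, ...}
    #   decoding_dict["ns"]  ~  {0: "alice", 1: "queen", ...}
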
# With the above information, we can now determine the required resources to
# store our data in a qubit register.

# Register must be large enough to support 2*|nouns| + |verbs| in given encoding
    len_reg_memory = q.encoding.utils.pow2bits( int(np.max( list(encoding_dict['v'].values()))) )[1] + \
                     q.encoding.utils.pow2bits( int(np.max( list(encoding_dict['no'].values()))) )[1] + \
                     q.encoding.utils.pow2bits( int(np.max( list(encoding_dict['ns'].values()))) )[1]

    len_reg_aux = len_reg_memory + 2
    num_qubits = len_reg_memory + len_reg_aux

    print("""{}
Requires {} qubits to encode data using {}
noun and {} verb basis elements, allowing a
maximum of {} unique patterns.
{}
""".format("#"*48, num_qubits, NUM_BASIS_NOUN, NUM_BASIS_VERB, (NUM_BASIS_NOUN**2)*NUM_BASIS_VERB, "#"*48)
    )

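    # As a quick sanity check on sizes (derived from the assignments above): if the
    # subject-noun, verb, and object-noun encodings require b_ns, b_v and b_no bits
    # respectively, then
    #   len_reg_memory = b_ns + b_v + b_no
    #   len_reg_aux    = len_reg_memory + 2
    #   num_qubits     = 2*len_reg_memory + 2
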
# Using encoded bitstrings for bases, look-up mapping terms for composite nouns
# and verbs, create bitstrings and generate quantum states.

    ns = []

# verb_bits = int(np.log2(len(verb_dist['verbs'])))
# noun_bits = int(np.log2(len(noun_dist['nouns'])))

    bit_shifts = [i[1] for i in q.utils.get_type_offsets(encoding_dict)]
    bit_shifts.reverse()

    bit_shifts.insert(0, 0)
    bit_shifts = np.cumsum(bit_shifts)
    bit_shifts

    corpus_list_n = vsm.tokens['nouns']
    corpus_list_v = vsm.tokens['verbs']
    dist_cutoff = BASIS_VERB_DIST_CUTOFF

    v_list = vg.calc_verb_noun_pairings(corpus_list_v, corpus_list_n, VERB_NOUN_DIST_CUTOFF)

    sentences = []

    for v in v_list:
        for i in v.lr_nouns.keys():  # product(v.left_nouns, [v.verb], v.right_nouns):
            # ns,s,no = mapping_nouns[i[0]], mapping_verbs[i[1]], mapping_nouns[i[2]]
            # if v.left_nouns != None and v.right_nouns != None:
            if mapping_nouns[i[0]] is not None and mapping_verbs[v.verb] is not None and mapping_nouns[i[1]] is not None:
                sentences.append(
                    [ {i[0] : [encoding_dict['ns'][k] for k in mapping_nouns[i[0]].keys()] },
                      {v.verb : [encoding_dict['v'][k] for k in mapping_verbs[v.verb].keys()] },
                      {i[1] : [encoding_dict['no'][k] for k in mapping_nouns[i[1]].keys()] }
                    ]
                )
    sentences
    print("Sentences matching noun-verb-noun structure captured as:", sentences)

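    # Each captured sentence is stored as a list of three single-entry dicts, mapping
    # the subject noun, verb, and object noun to the basis bit-patterns they may take,
    # e.g. (hypothetical values): [{'hatter': [0, 1]}, {'say': [2]}, {'queen': [1, 3]}]
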
    # Set up registers to store indices
    # Keeping aux register and control registers in these positions
    # to reduce overhead during encoding stages. ~25% faster than data-aux-control
    reg_memory = [0]*len_reg_memory
    for i in range(len_reg_memory):
        reg_memory[i] = i + len_reg_aux

    reg_aux = [0]*len_reg_aux
    for i in range(len_reg_aux-2):
        reg_aux[i] = i + 2
    reg_aux[-2] = 0
    reg_aux[-1] = 1
    print("REG_MEM=", reg_memory)
    print("REG_AUX=", reg_aux)

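    # For example (values derived from the loop logic above), with len_reg_memory = 6
    # and hence len_reg_aux = 8, the qubit indices come out as
    #   reg_aux    = [2, 3, 4, 5, 6, 7, 0, 1]
    #   reg_memory = [8, 9, 10, 11, 12, 13]
    # i.e. the two control qubits occupy positions 0 and 1, the remaining auxiliary
    # qubits follow, and the memory register takes the highest indices.
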
# Create list for the patterns to be encoded
    vec_to_encode = []

# Generate bit-patterns from sentences and store in vec_to_encode

    for idx, sentence in enumerate(sentences):
        superpos_patterns = list( product( list(sentence[0].values())[0], list(sentence[1].values())[0], list(sentence[2].values())[0] ) )
        # Generate all combinations of the bit-patterns for superpos states
        for patt in superpos_patterns:
            num = q.utils.encode_binary_pattern_direct(patt, encoding_dict)
            vec_to_encode.extend([num])

# Need to remove duplicates
    vec_to_encode = list(set(vec_to_encode))
    vec_to_encode.sort()

    d = {"sentences" : len(sentences),
         "patterns" : len(vec_to_encode),
         "NUM_BASIS_NOUN" : NUM_BASIS_NOUN,
         "NUM_BASIS_VERB" : NUM_BASIS_VERB,
         "BASIS_NOUN_DIST_CUTOFF" : BASIS_NOUN_DIST_CUTOFF,
         "BASIS_VERB_DIST_CUTOFF" : BASIS_VERB_DIST_CUTOFF,
         "VERB_NOUN_DIST_CUTOFF" : VERB_NOUN_DIST_CUTOFF
         }
    print(d)
    print("Encoding data:", vec_to_encode)

    # Counter for experiment results; key exists for each potentially unique outcome defined by encoding
    shot_counter = {}

    for i in vec_to_encode:
        shot_counter.update({i : 0})

# from IPython import embed; embed()

else:
    reg_memory = None
    reg_aux = None
    len_reg_memory = None
    vec_to_encode = None
    shot_counter = None

reg_memory = comm.bcast(reg_memory, root=0)
reg_aux = comm.bcast(reg_aux, root=0)
vec_to_encode = comm.bcast(vec_to_encode, root=0)
shot_counter = comm.bcast(shot_counter, root=0)

num_qubits = len(reg_memory) + len(reg_aux)

# Explicitly disable fusion as it can cause incorrect results
use_fusion = False

if os.environ.get('RESOURCE_EST') is not None:
    print("Overriding default qubit count")
    num_qubits = 1

sim = p(num_qubits, use_fusion)
normalise = True

if rank == 0:
    try:
        num_exps = int(sys.argv[2])
    except Exception as e:
        num_exps = 10

    pbar = tqdm(total=num_exps)
    pbar.update(0)
else:
    num_exps = None

num_exps = comm.bcast(num_exps, root=0)

num_faults = 0

if rank == 0:
    test_pattern = 0
    test_string = ('hatter', 'say', 'queen')
    test_pattern = q.utils.encode_binary_pattern(test_string, encoding_dict)

    print("Test string: {} pattern: {}".format(test_string, test_pattern))
else:
    test_pattern = None

test_pattern = comm.bcast(test_pattern, root=0)
comm.Barrier()

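# As a purely classical cross-check (a sketch; it plays no part in the quantum
# workflow), the Hamming distance between the test pattern and each encoded pattern
# can be tabulated up front, which helps when interpreting the measurement histogram
# produced below.
if rank == 0:
    classical_dists = {patt: bin(patt ^ test_pattern).count("1") for patt in vec_to_encode}
    print("Classical Hamming distances to test pattern:", classical_dists)
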
for exp in range(num_exps):
    sim.initRegister()

    if rank == 0:
        print("Encoding {} patterns for experiment {} of {}".format(len(vec_to_encode), exp+1, num_exps))
        sys.stdout.flush()

    # Encode
    sim.encodeBinToSuperpos_unique(reg_memory, reg_aux, vec_to_encode, len(reg_memory))

    # Compute Hamming distance between test pattern and encoded patterns
    sim.applyHammingDistanceRotY(test_pattern, reg_memory, reg_aux, len(reg_memory))

    # Measure
    sim.collapseToBasisZ(reg_aux[len(reg_aux)-2], 1)

    val = sim.applyMeasurementToRegister(reg_memory, normalise)

    try:
        shot_counter[val] += 1
    except Exception as e:
        print("Measured pattern {} does not match data for experiment {} of {}. Discarding.".format(val, exp+1, num_exps), file=sys.stderr)
        sys.stderr.flush()
        num_faults += 1
        continue

    if rank == 0:
        pbar.update(1)
        print("Measured pattern {} for experiment {} of {}".format(val, exp+1, num_exps))
        print(pbar)
        sys.stdout.flush()

if rank == 0:
    pbar.close()
    print("#"*48, "Shot counter values")
    print(shot_counter)
    print("Number of faults: {}".format(num_faults))

    # Sort the keys by largest first
    shot_counter_s = [ (k, v) for k, v in sorted(shot_counter.items(), key=lambda item: item[1], reverse=True)]

    key_order = [i[0] for i in shot_counter_s]  # list(shot_counter.keys())

    xlab_str = [q.utils.decode_binary_pattern(i, decoding_dict) for i in key_order]
    xlab_bin = ["{0:0{num_bits}b}".format(i, num_bits=len_reg_memory) for i in key_order]

    pattern_dict = {k:v for k,v in zip(xlab_str, xlab_bin)}
    pattern_count = {k:v for k,v in zip(xlab_str, [shot_counter[i] for i in key_order])}

    print("#"*48, "Shot counter values key to token state")
    print(pattern_dict)
    print("#"*48, "Token state counts")
    print(pattern_count)

    import matplotlib as mpl
    mpl.use('Agg')
    import matplotlib.pyplot as plt

    #hist_list = list(zip(
    #    [i[0]+r" |"+i[1]+r">" for i in zip(xlab_str,xlab_bin)],
    #    [i/np.sum(list(shot_counter.values())) for i in list(shot_counter.values())]
    #))

    #labels = [x[0] for x in hist_list]
    #post_vals = [y[1] for y in hist_list]

    labels = xlab_str  # [i[0] for i in shot_counter_s]
    post_vals = [i[1] for i in shot_counter_s]

    pv_sum = np.sum(list(shot_counter.values()))
    hist_list_hamCl = list(zip(
        [",".join(i[0]) + r" " + str(q.utils.HammingInt(test_pattern, int(i[1], 2))) for i in zip(xlab_str, xlab_bin)],
        [i/pv_sum for i in post_vals]
    ))
    labelsHCL = [x[0] for x in hist_list_hamCl]

    x = np.arange(len(labels))  # the label locations
    width = 0.65  # the width of the bars

    fig, ax = plt.subplots()

    rects2 = ax.bar(x + width/2, post_vals, width, label='Measurement')

    test_pattern_str = q.utils.decode_binary_pattern(test_pattern, decoding_dict)
    print("Test pattern={}".format(test_pattern_str))

    # Add some text for labels, title and custom x-axis tick labels, etc.
    ax.set_ylabel(r"P({})".format(test_pattern_str), fontsize=24)
    ax.set_xticks(x)
    ax.tick_params(axis='both', which='major', labelsize=10)
    ax.set_xticklabels(labels, rotation=-35, ha="left", fontsize=10)
    ax.legend(fontsize=16)

    plt.axhline(y=1.0/len(shot_counter), color='crimson', linestyle="--")
    plt.text(len(shot_counter)-0.1, 1.0/(len(shot_counter)), '1/n', horizontalalignment='left', verticalalignment='center', fontsize=16)
    plt.tight_layout()
    fig.set_size_inches(20, 12, forward=True)
    plt.savefig("qnlp_e2e.pdf")

    fig, ax = plt.subplots()

    rects2 = ax.bar(x + width/2, post_vals, width, label='Measurement')

    # Add some text for labels, title and custom x-axis tick labels, etc.
    ax.set_ylabel(r"P({})".format(test_pattern_str), fontsize=24)
    ax.set_xticks(x)
    ax.tick_params(axis='both', which='major', labelsize=10)
    ax.set_xticklabels(labelsHCL, rotation=-35, ha="left", fontsize=10)
    ax.legend(fontsize=16)

    plt.axhline(y=1.0/len(shot_counter), color='crimson', linestyle="--")
    plt.text(len(shot_counter)-0.1, 1.0/(len(shot_counter)), '1/n', horizontalalignment='left', verticalalignment='center', fontsize=16)
    plt.tight_layout()
    fig.set_size_inches(20, 12, forward=True)
    plt.savefig("qnlp_e2e_ham.pdf")

    import pickle

    data = { "key_order" : key_order, "xlab_str" : xlab_str, "xlab_bin" : xlab_bin, "pattern_dict" : pattern_dict, "pattern_count" : pattern_count, "shot_counter" : shot_counter, "encoding_dict" : encoding_dict, "decoding_dict" : decoding_dict }
    with open("qnlp_e2e_{}.pkl".format(test_pattern_str), "wb") as f:
        pickle.dump(data, f)
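    # To inspect the results afterwards (a usage sketch; the actual filename depends
    # on the decoded test pattern above):
    #   with open("qnlp_e2e_<test_pattern>.pkl", "rb") as f_in:
    #       results = pickle.load(f_in)
    #   print(results["pattern_count"])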