import nltk
import numpy as np
from collections import Counter
from nltk.corpus import stopwords

# The tag-matching helper (tg) and database wrapper (qdb) used below are
# project-local modules imported earlier in this file.

sw = stopwords.words('english')
20 """Remove words that do not add to the meaning; simplifies sentences""" 21 return [w
for w
in text
if w
not in sw]
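A brief usage sketch, assuming the NLTK stop-word data have been downloaded; the token list is illustrative only:

tokens = ['the', 'cat', 'sat', 'on', 'the', 'mat']
print(remove_stopwords(tokens, sw))   # ['cat', 'sat', 'mat']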
def tokenize_corpus(corpus, proc_mode=0, stop_words=True):
    """Pass the corpus as a string, which is subsequently broken into tokenized
    sentences and returned as a dictionary of verbs, nouns, tokenized words,
    and tokenized sentences.

    Keyword arguments:
    corpus -- string representing the corpus to tokenize
    proc_mode -- defines the processing mode. Lemmatization: proc_mode="l";
                 stemming: proc_mode="s"; no additional processing: proc_mode=0 (default=0)
    stop_words -- indicates whether stop words should be kept (True) or removed (False)
                  (stop words taken from nltk.corpus.stopwords.words("english"))
    """
    token_sents = nltk.sent_tokenize(corpus)
    token_words = []
    tags = []

    for s in token_sents:
        tk = nltk.word_tokenize(s)
        if stop_words == False:
            tk = remove_stopwords(tk, sw)
        token_words.extend(tk)
        tags.extend(nltk.pos_tag(tk))

    if proc_mode == 's':
        s = nltk.SnowballStemmer('english', ignore_stopwords=not stop_words)
        token_words = [s.stem(t) for t in token_words]
    elif proc_mode == 'l':
        wnl = nltk.WordNetLemmatizer()
        token_words = [wnl.lemmatize(t) for t in token_words]

    tags = nltk.pos_tag(token_words)
    nouns = [i[0] for i in tags if tg.matchables(tg.Noun, i[1])]
    verbs = [i[0] for i in tags if tg.matchables(tg.Verb, i[1])]

    count_nouns = Counter(nouns)
    count_verbs = Counter(verbs)

    return {'verbs': count_verbs, 'nouns': count_nouns,
            'tk_sentence': token_sents, 'tk_words': token_words}
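A hedged usage sketch (the sample corpus string is made up, and the NLTK 'punkt' and POS-tagger data are assumed to be downloaded):

words = tokenize_corpus("Cats chase mice. Dogs chase cats.")
print(words['tk_sentence'])   # ['Cats chase mice.', 'Dogs chase cats.']
print(words['nouns'])         # Counter mapping each detected noun to its frequency
print(words['verbs'])         # Counter mapping each detected verb to its frequency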
66 """Load the corpus from disk.""" 68 with open(corpus_path,
'r')
as corpusFile:
69 corpus_text=corpusFile.read()
73 """Tokenize the corpus data.""" 81 Examine the words around each verb with the specified window size, and attempt to match the NVN pattern. The window_size specifies the number of values around each verb to search for the matching nouns. If passed as tuple 82 (l,r) gives the left and right windows separately. If passed as a scalar, both values are equal. 85 words -- list of the tokenized words 86 window_size -- window to search for word pairings. Tuple 90 sentence_boundary = set([i
for i, x
in enumerate(words[
'tk_words'])
if "." in x])
92 if type(window_size)
is tuple:
93 window_left, window_right = window_size
95 window_left = window_size
96 window_right = window_size
99 for v
in words[
'verbs']:
100 verb_idx.update({v:[i
for i, x
in enumerate(words[
'tk_words'])
if v
in x]})
102 for n
in words[
'nouns']:
103 for v,k
in verb_idx.items():
106 l_left = len(sentence_boundary.intersection(range(v_idx-window_left,v_idx)))
107 l_right = len(sentence_boundary.intersection(range(v_idx,v_idx+window_right)))
109 if l_left==0
and n
in words[
'tk_words'][v_idx-window_left:v_idx]:
111 if l_right==0
and (v_idx + window_right) < len(words[
'tk_words'])
and n
in words[
'tk_words'][v_idx:v_idx+window_right] :
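To make the windowing concrete, the following standalone sketch (hypothetical tokens, not part of the module) shows how a (2, 3) window around the verb 'chase' stays inside the sentence and finds a noun on each side:

tk_words = ['Big', 'cats', 'chase', 'small', 'mice', '.', 'Dogs', 'sleep', '.']
sentence_boundary = {i for i, x in enumerate(tk_words) if '.' in x}   # {5, 8}

v_idx = 2                              # index of the verb 'chase'
window_left, window_right = 2, 3       # window_size passed as the tuple (2, 3)

# Neither window crosses a sentence boundary...
print(sentence_boundary.intersection(range(v_idx - window_left, v_idx)))    # set()
print(sentence_boundary.intersection(range(v_idx, v_idx + window_right)))   # set()

# ...so nouns found inside the windows may be paired with the verb.
print(tk_words[v_idx - window_left:v_idx])    # ['Big', 'cats']            -> 'cats'
print(tk_words[v_idx:v_idx + window_right])   # ['chase', 'small', 'mice'] -> 'mice'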
def num_qubits(words):
    """Calculate the number of qubits required to encode the NVN state-space of
    the corpus. Returns a tuple of the number of unique states and the qubit
    counts (total, nouns, verbs).

    While the qubit count could be reduced by encoding the product of the
    combined spaces, it is easier to keep separate registers for nouns and
    verbs, at the cost of at most one extra qubit.

    Keyword arguments:
    words -- list of the tokenized words
    """
    nvn_space_size = len(words['nouns'])**2 * len(words['verbs'])
    req_qubits_n = int(np.ceil(np.log2(len(words['nouns']))))
    req_qubits_v = int(np.ceil(np.log2(len(words['verbs']))))

    # Two noun registers plus one verb register.
    req_qubits = req_qubits_n*2 + req_qubits_v

    print("Unique states:", nvn_space_size,
          "\tRequired qubits total:", req_qubits,
          "\tRequired qubits nouns:", req_qubits_n,
          "\tRequired qubits verbs:", req_qubits_v)
    return (nvn_space_size, req_qubits, req_qubits_n, req_qubits_v)
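As a worked example with hypothetical vocabulary sizes, a basis of 8 unique nouns and 4 unique verbs gives:

nvn_space_size = 8**2 * 4                        # 256 unique noun-verb-noun states
req_qubits_n   = int(np.ceil(np.log2(8)))        # 3 qubits per noun register
req_qubits_v   = int(np.ceil(np.log2(4)))        # 2 qubits for the verb register
req_qubits     = req_qubits_n*2 + req_qubits_v   # 8 qubits, covering 2**8 = 256 states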
def mapNameToBinaryBasis(words, db_name, table_name="qnlp"):
    """Maps the unique string in each respective space to a binary basis number
    for qubit representation.

    Keyword arguments:
    words -- list of the tokenized words
    db_name -- name of the database file
    table_name -- name of the table in which to store the data in db_name
    """
    _, num_q_total, num_q_n, num_q_v = num_qubits(words)

    # Bidirectional maps: string <-> zero-padded binary basis index.
    nounMap = {}
    for i, v in enumerate(words['nouns']):
        nounMap.update({v: format(i, 'b').zfill(num_q_n)})
        nounMap.update({format(i, 'b').zfill(num_q_n): v})

    verbMap = {}
    for i, v in enumerate(words['verbs']):
        verbMap.update({v: format(i, 'b').zfill(num_q_v)})
        verbMap.update({format(i, 'b').zfill(num_q_v): v})

    db = qdb.qnlp_db(db_name, ".")
    db.db_insert(nounMap, "noun", table_name)
    db.db_insert(verbMap, "verb", table_name)

    return (nounMap, verbMap)
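A sketch of the bidirectional encoding for a hypothetical three-noun vocabulary (the database write is omitted); three nouns need num_q_n = 2 qubits:

nouns = ['cat', 'dog', 'man']          # hypothetical nouns; ceil(log2(3)) = 2 qubits
nounMap = {}
for i, v in enumerate(nouns):
    nounMap.update({v: format(i, 'b').zfill(2)})
    nounMap.update({format(i, 'b').zfill(2): v})

print(nounMap)
# {'cat': '00', '00': 'cat', 'dog': '01', '01': 'dog', 'man': '10', '10': 'man'}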
def run(BasisPath, CorpusPath, proc_mode=0, db_name="qnlp_tagged_corpus"):
    with open(BasisPath, 'r') as basisFile:
        basis_text = basisFile.read()

    with open(CorpusPath, 'r') as corpusFile:
        corpus_text = corpusFile.read()

    # Tokenize the basis and corpus texts.
    basis_words = tokenize_corpus(basis_text, proc_mode=proc_mode)
    corpus_words = tokenize_corpus(corpus_text, proc_mode=proc_mode)

    sentence_boundary = set([i for i, x in enumerate(corpus_words['tk_words']) if "." in x])

    _, num_q_total, num_q_n, num_q_v = num_qubits(basis_words)
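A top-level usage sketch; the file paths are hypothetical and proc_mode="l" requests lemmatization:

if __name__ == "__main__":
    run("basis.txt", "corpus.txt", proc_mode="l", db_name="qnlp_tagged_corpus")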
def word_pairing(words, window_size)
def remove_stopwords(text, sw)
def mapNameToBinaryBasis(words, db_name, table_name="qnlp")
def tokenize_corpus(corpus, proc_mode=0, stop_words=True)
def process(corpus_path, proc_mode=0)
def run(BasisPath, CorpusPath, proc_mode=0, db_name="qnlp_tagged_corpus")
def load_corpus(corpus_path, proc_mode=0)