parent 873e9ff7d2
commit 66e4b972eb

@@ -0,0 +1,32 @@
remove greeting phrases at the beginning
whitelist (incl. kb-keywords)
keep acronyms & abbreviations
tagging before normalization
experiment with upper/lower case
do not build bigrams on the normalized text
relevance of certain words
keep numbers
include the ticket subject
have ITMC determine the topics after LDA
include the tree hierarchy of the category (improve the dataset if necessary)
automatically add current technical terms to the whitelist
Levenshtein/Hamming distance instead of autocorrect (if the distance is smaller than x, treat it as the same word); see the sketch after this file
TODO Wednesday: full-text indexing (term frequencies; for numbers, treat predecessor/successor as one term)
keep the main verb (root)
reduce the categories: ontologies/organigram
remove footer/header
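The Levenshtein/Hamming note above is only an idea at this point; nothing in this commit implements it. Below is a minimal sketch of the intended behaviour, assuming plain Python and treating two tokens as the same word when their edit distance is at most a threshold (the "x" from the note, here max_dist). The names levenshtein and merge_similar_terms are illustrative only and do not exist in the repository.

# Illustrative sketch only, not part of this commit.
def levenshtein(a, b):
    # classic dynamic-programming edit distance between two strings
    if len(a) < len(b):
        a, b = b, a
    previous = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        current = [i]
        for j, cb in enumerate(b, 1):
            insert_cost = current[j - 1] + 1
            delete_cost = previous[j] + 1
            replace_cost = previous[j - 1] + (ca != cb)
            current.append(min(insert_cost, delete_cost, replace_cost))
        previous = current
    return previous[-1]


def merge_similar_terms(terms, max_dist=2):
    # map every term to the first already-seen term within max_dist edits,
    # so near-misspellings are counted as the same word instead of being autocorrected
    canonical = []
    mapping = {}
    for term in terms:
        for seen in canonical:
            if levenshtein(term, seen) <= max_dist:
                mapping[term] = seen
                break
        else:
            canonical.append(term)
            mapping[term] = term
    return mapping


print(merge_similar_terms(["drucker", "druker", "monitor"]))
# {'drucker': 'drucker', 'druker': 'drucker', 'monitor': 'monitor'}

If speed matters, a library such as python-Levenshtein could replace the hand-written distance function.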
main.py (13 lines changed)
@@ -31,6 +31,19 @@ start = time.time()
# todo: test the models

logprint("main.py started at {}".format(datetime.now()))
@@ -16,6 +16,7 @@ import glob, os
from textacy.fileio import open_sesame
import json
from spacy.tokens.doc import Doc as SpacyDoc
import operator

csv.field_size_limit(sys.maxsize)
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

@@ -124,6 +125,10 @@ def list_from_files(*paths):
def breakpoint():
    pass


def sort_dictionary(dict):
    # sort dict items ascending by value
    return sorted(dict.items(), key=operator.itemgetter(1))


def normalize(string):
    # replaceRockDots
    string = re.sub(r'[ß]', "ss", string.lower())
@@ -296,6 +296,7 @@ path2firstnameslist = ressources_path + config.get("firstnames","pickle_file")

path2DEstopwordlist = ressources_path + config.get("de_stopwords", "pickle_file")

path2ENstopwordlist = ressources_path + config.get("en_stopwords", "pickle_file")

custom_words = get_list_from_config("preprocessing", "custom_words")
test.py (172 lines changed)
@@ -21,18 +21,18 @@ FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
import draw


"""
# load corpus
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
preCorpus_name = "de" + "_pre_ticket"
corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path)
logprint("Corpus loaded: {0}".format(corpus.lang))
#todo randomize corpus


#todo randomize


split = 0.8
@@ -64,9 +64,10 @@ model.fit(doc_term_matrix)


compenents = model.model.components_

"""

components_ : array, [n_components, n_features]

Variational parameters for topic word distribution.
@@ -78,9 +79,78 @@ the number of times word j was assigned to topic i.
It can also be viewed as distribution over the words for each topic after normalization:
model.components_ / model.components_.sum(axis=1)[:, np.newaxis].

"""


test_doc = corpus_test[0]
bla = test_doc.to_bag_of_terms(ngrams=1, named_entities=True, normalize=u'lower', lemmatize=None, lowercase=True, weighting=u'count', as_strings=False)
key_list = bla.keys()
bla_list = list(bla)

print(bla)
print(bla_list)

for k in bla.keys():
    print(id2term[k])

"""


"""

ressources_path = FILEPATH + "ressources/"
path2DEstopwordlist = ressources_path + config.get("de_stopwords", "pickle_file")
DE_STOP_WORDS = load_obj(path2DEstopwordlist)


# load corpus
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
rawCorpus_name = "de" + "_raw_ticket"
corpus, parser = load_corpus(corpus_name=rawCorpus_name, corpus_path=corpus_de_path)

#parser = spacy.load("de")
#corpus = textacy.Corpus(parser)

# TODO Wednesday: full-text indexing (term frequencies; for numbers, treat predecessor/successor as one term)
"""
testtxt = "Sehr geehrtes ITMC Service Team,\r\n\r\nseit ein einiger Zeit scheint der Netzwerkanschluss eines Kollegen" \
          " an das Intranet der BMP mit der Dosennummer G1 303/04/12.05 (G1 4 26-1) in Raum G1-426 nicht mehr zu funktionieren. " \
          "\r\nIch würde Sie daher bitten diese Mail an den zuständigen Kollegen weiterzuleiten, um die Leitung vielleicht einmal zu Prüfen.\r\n\r\n" \
          "Des Weiteren hätte ich noch eine Frage bezüglich der Möglichkeit zur Nutzung einer VPN Verbindung aus" \
          " unserem Intranet heraus zu einem fremden Netzwerk. Dies ist zwar über das WLAN-Netz möglich, jedoch nicht " \
          "aus unserem Netzwerk heraus. Vielleicht können Sie mir mitteilen an welchen Kollegen ich mich bezüglich" \
          " dieses Problem wenden kann.\r\n\r\nBei Rückfragen stehe ich gerne zur Verfügung!\r\n\r\nBeste Grüße," \
          "\r\n\r\nNicolas Rauner\r\n\r\nLS Biomaterialien und Polymerwissenschaften\r\nFakultät Bio- und Chemieingenieurwesen\r\nTU Dortmund" \
          " \r\nD-44227 Dortmund\r\n\r\nTel: + 49-(0)231 / 755 - 3015\r\nFax: + 49-(0)231 / 755 - 2480\r\n\r\nwww.ls-bmp.de <http://www.ls-bmp.de/>"

#corpus.add_text(testtxt)
"""

term_dict_w_stop = {}
term_dict_wo_stop = {}
footings = ["gruss", "grusse", "gruesse", "gruessen", "grusses"]

for doc in corpus:

    tokens = [tok for tok in doc]

    # strip the footer: cut the token list off at the first greeting phrase

    for i, tok in enumerate(tokens):
        text = tok.text
        text = re.sub(r'[ß]', "ss", text)
        text = re.sub(r'[ö]', "oe", text)
        text = re.sub(r'[ü]', "ue", text)
        text = re.sub(r'[ä]', "ae", text)

        for gr in footings:
            if gr in text.lower():
                tokens = tokens[0:i]
                #print(tokens)
                break
@@ -88,12 +158,85 @@ test_doc = corpus_test[0]

    for i, tok in enumerate(tokens):

        if tok.is_space or tok.is_punct or tok.like_url or tok.like_email:
            continue

        if i != 0:
            #text = tok.text if tokens[i-1].pos_ is not "NUM" else tok.text+" "+tokens[i-1].text

            # if the previous token is a number, merge both into one term
            if tokens[i-1].like_num:
                text = tokens[i - 1].text + " " + tok.text
            else:
                text = tok.text

        else:
            text = tok.text

        # replaceRockDots
        text = re.sub(r'[ß]', "ss", text)
        text = re.sub(r'[ö]', "oe", text)
        text = re.sub(r'[ü]', "ue", text)
        text = re.sub(r'[ä]', "ae", text)

        # term frequencies including stopwords
        if text not in term_dict_w_stop.keys():
            term_dict_w_stop[text] = 1
        else:
            term_dict_w_stop[text] += 1

        # term frequencies with stopwords filtered out
        if text.lower() not in DE_STOP_WORDS:
            if text not in term_dict_wo_stop.keys():
                term_dict_wo_stop[text] = 1
            else:
                term_dict_wo_stop[text] += 1


term_dict_sorted = sort_dictionary(term_dict_w_stop)
term_dict_wo_sorted = sort_dictionary(term_dict_wo_stop)

split_value = 0.2
from_ = int((1 - split_value) * float(len(term_dict_sorted)))  # 1 - split
to_ = len(term_dict_sorted)

#logprint(term_dict_sorted[from_: to_])
#logprint("\n")
#logprint(term_dict_wo_sorted[from_: to_])


for elem in term_dict_sorted:
    logprint(elem)

logprint("\n")
logprint("\n")
logprint("\n")
logprint("\n")

for elem in term_dict_wo_sorted:
    logprint(elem)

"""


in_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/log/terms_without_stop.txt"
out_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/log/terms_without_stopwords.txt"

gen = reversed(list(open(in_path)))


textacy.fileio.write_file_lines(gen, out_path)
@@ -149,27 +292,6 @@ print("\n\n\nTime Elapsed Test:{0}\n\n".format(end - start))



"""
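The sklearn docstring quoted above in test.py describes model.components_ as a matrix that becomes a per-topic word distribution after row normalization, but test.py never prints the topics themselves. As a hedged sketch that is not part of this commit, the snippet below shows one way the top terms per topic could be read off such a matrix; top_terms_per_topic is an illustrative name, and the components and id2term arguments stand in for model.model.components_ and the vocabulary mapping used in test.py.

# Illustrative sketch only, not part of this commit.
import numpy as np

def top_terms_per_topic(components, id2term, n_terms=10):
    # normalize each row so it can be read as a word distribution per topic,
    # as in the quoted docstring: components_ / components_.sum(axis=1)[:, np.newaxis]
    distributions = components / components.sum(axis=1)[:, np.newaxis]
    topics = []
    for row in distributions:
        best = row.argsort()[::-1][:n_terms]   # indices of the highest-weight terms
        topics.append([id2term[int(i)] for i in best])
    return topics

# toy demo with a made-up 2-topic, 5-term vocabulary
demo_components = np.array([[5.0, 2.0, 1.0, 0.5, 0.1],
                            [0.5, 0.3, 4.0, 3.0, 1.0]])
demo_vocab = {0: "drucker", 1: "toner", 2: "vpn", 3: "wlan", 4: "passwort"}
print(top_terms_per_topic(demo_components, demo_vocab, n_terms=3))
# [['drucker', 'toner', 'vpn'], ['vpn', 'wlan', 'passwort']]

With the objects built in test.py this would be called roughly as top_terms_per_topic(model.model.components_, id2term), assuming id2term maps feature indices to term strings.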