This commit is contained in:
parent 873e9ff7d2
commit 66e4b972eb
@@ -0,0 +1,32 @@
Remove greeting formulas at the start of the ticket text
Whitelist (incl. kb keywords)
Keep acronyms & abbreviations
Tagging before normalization
Experiment with upper/lower case
Don't build bigrams on the normalized text
Relevance of certain words
Keep numbers
Include the ticket subject
Let ITMC determine the topics after LDA
Include the tree hierarchy of the category (improve the dataset if necessary)
Automatically add current technical terms to the whitelist
Levenshtein/Hamming distance instead of autocorrect (if the distance is smaller than x, treat it as the same word; see the sketch below this list)
TODO Wednesday: full-text indexing (term frequencies; for numbers, treat predecessor/successor as one term)
Keep the main verb (root)
Shrink the categories: ontologies/organigram
Remove footer/header
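A minimal sketch of the Levenshtein idea from the note above: two tokens count as the same word when their edit distance stays below a threshold. The function names and the max_dist threshold are illustrative assumptions, not code from this commit.

def levenshtein(a, b):
    # classic dynamic-programming edit distance between two strings
    if len(a) < len(b):
        a, b = b, a
    previous = list(range(len(b) + 1))
    for i, ca in enumerate(a, start=1):
        current = [i]
        for j, cb in enumerate(b, start=1):
            insert = current[j - 1] + 1
            delete = previous[j] + 1
            substitute = previous[j - 1] + (ca != cb)
            current.append(min(insert, delete, substitute))
        previous = current
    return previous[-1]

def same_word(token_a, token_b, max_dist=2):
    # treat two tokens as the same word if their edit distance stays within the threshold
    return levenshtein(token_a.lower(), token_b.lower()) <= max_dist

print(same_word("netzwerk", "netzwrek"))  # True: distance 2, within the default threshold

Unlike autocorrect, this only merges near-identical spellings and never rewrites a token into a different word.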
13  main.py

@@ -31,6 +31,19 @@ start = time.time()
# todo test models

logprint("main.py started at {}".format(datetime.now()))
@@ -16,6 +16,7 @@ import glob, os
from textacy.fileio import open_sesame
import json
from spacy.tokens.doc import Doc as SpacyDoc
import operator

csv.field_size_limit(sys.maxsize)
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
@@ -124,6 +125,10 @@ def list_from_files(*paths):
def breakpoint():
    pass

def sort_dictionary(dict):
    return sorted(dict.items(), key=operator.itemgetter(1))

def normalize(string):
    # replaceRockDots
    string = re.sub(r'[ß]', "ss", string.lower())
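A quick usage sketch of the new sort_dictionary helper (the sample counts are made up): it sorts ascending by value, so the most frequent terms end up at the end of the returned list, which is what the from_/to_ slice in test.py relies on.

counts = {"drucker": 5, "vpn": 12, "passwort": 3}
print(sort_dictionary(counts))  # [('passwort', 3), ('drucker', 5), ('vpn', 12)]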
@@ -296,6 +296,7 @@ path2firstnameslist = ressources_path + config.get("firstnames","pickle_file")
path2DEstopwordlist = ressources_path + config.get("de_stopwords", "pickle_file")
path2ENstopwordlist = ressources_path + config.get("en_stopwords", "pickle_file")
custom_words = get_list_from_config("preprocessing", "custom_words")
172  test.py

@@ -21,18 +21,18 @@ FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
import draw

"""
# load corpus
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
preCorpus_name = "de" + "_pre_ticket"
corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path)
logprint("Corpus loaded: {0}".format(corpus.lang))

#todo randomize corpus

#todo randomize

split = 0.8
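A sketch of the randomized 80/20 split the #todo hints at, assuming the corpus can be handled as a plain list of documents; the docs list below is a dummy stand-in, not repository code.

import random

docs = ["ticket {}".format(i) for i in range(10)]  # placeholder for the loaded corpus
split = 0.8

random.seed(42)              # reproducible shuffle while experimenting
random.shuffle(docs)
cut = int(split * len(docs))
corpus_train, corpus_test = docs[:cut], docs[cut:]
print(len(corpus_train), len(corpus_test))  # 8 2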
@@ -64,9 +64,10 @@ model.fit(doc_term_matrix)

compenents = model.model.components_

"""
components_ : array, [n_components, n_features]

Variational parameters for topic word distribution.
@@ -78,9 +79,78 @@ the number of times word j was assigned to topic i.
It can also be viewed as distribution over the words for each topic after normalization:
model.components_ / model.components_.sum(axis=1)[:, np.newaxis].
"""

test_doc = corpus_test[0]
bla = test_doc.to_bag_of_terms(ngrams=1, named_entities=True, normalize=u'lower', lemmatize=None, lowercase=True, weighting=u'count', as_strings=False)
key_list = bla.keys()
bla_list = list(bla)

print(bla)
print(bla_list)

for k in bla.keys():
    print(id2term[k])
"""

"""
ressources_path = FILEPATH + "ressources/"
path2DEstopwordlist = ressources_path + config.get("de_stopwords", "pickle_file")
DE_STOP_WORDS = load_obj(path2DEstopwordlist)

# load corpus
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
rawCorpus_name = "de" + "_raw_ticket"
corpus, parser = load_corpus(corpus_name=rawCorpus_name, corpus_path=corpus_de_path)

#parser = spacy.load("de")
#corpus = textacy.Corpus(parser)

#TODO Wednesday: full-text indexing (term frequencies; for numbers, treat predecessor/successor as one term)
"""
testtxt = "Sehr geehrtes ITMC Service Team,\r\n\r\nseit ein einiger Zeit scheint der Netzwerkanschluss eines Kollegen" \
          " an das Intranet der BMP mit der Dosennummer G1 303/04/12.05 (G1 4 26-1) in Raum G1-426 nicht mehr zu funktionieren. " \
          "\r\nIch würde Sie daher bitten diese Mail an den zuständigen Kollegen weiterzuleiten, um die Leitung vielleicht einmal zu Prüfen.\r\n\r\n" \
          "Des Weiteren hätte ich noch eine Frage bezüglich der Möglichkeit zur Nutzung einer VPN Verbindung aus" \
          " unserem Intranet heraus zu einem fremden Netzwerk. Dies ist zwar über das WLAN-Netz möglich, jedoch nicht " \
          "aus unserem Netzwerk heraus. Vielleicht können Sie mir mitteilen an welchen Kollegen ich mich bezüglich" \
          " dieses Problem wenden kann.\r\n\r\nBei Rückfragen stehe ich gerne zur Verfügung!\r\n\r\nBeste Grüße," \
          "\r\n\r\nNicolas Rauner\r\n\r\nLS Biomaterialien und Polymerwissenschaften\r\nFakultät Bio- und Chemieingenieurwesen\r\nTU Dortmund" \
          " \r\nD-44227 Dortmund\r\n\r\nTel: + 49-(0)231 / 755 - 3015\r\nFax: + 49-(0)231 / 755 - 2480\r\n\r\nwww.ls-bmp.de <http://www.ls-bmp.de/>"

#corpus.add_text(testtxt)
"""

term_dict_w_stop = {}
term_dict_wo_stop = {}
footings = ["gruss", "grusse", "gruesse", "gruessen", "grusses"]

for doc in corpus:

    tokens = [tok for tok in doc]

    # strip the footer: cut the token list at the first greeting word

    for i,tok in enumerate(tokens):
        text = tok.text
        text = re.sub(r'[ß]', "ss", text)
        text = re.sub(r'[ö]', "oe", text)
        text = re.sub(r'[ü]', "ue", text)
        text = re.sub(r'[ä]', "ae", text)

        for gr in footings:
            if gr in text.lower():
                tokens = tokens[0:i]
                #print(tokens)
                break
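The four umlaut substitutions above reappear in the next hunk and as normalize() in the other file; a small helper along these lines could replace the repeated blocks. The name replace_rock_dots is an assumption borrowed from the # replaceRockDots comments, not a function in the repository.

import re

def replace_rock_dots(text):
    # replace German umlauts and eszett with ASCII digraphs, as done inline in this commit
    for pattern, repl in ((r'[ß]', "ss"), (r'[ö]', "oe"), (r'[ü]', "ue"), (r'[ä]', "ae")):
        text = re.sub(pattern, repl, text)
    return text

print(replace_rock_dots("Grüße"))  # Gruesse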
@@ -88,12 +158,85 @@ test_doc = corpus_test[0]

    for i,tok in enumerate(tokens):

        if tok.is_space or tok.is_punct or tok.like_url or tok.like_email:
            continue

        if i is not 0:
            #text = tok.text if tokens[i-1].pos_ is not "NUM" else tok.text+" "+tokens[i-1].text

            if tokens[i-1].like_num:
                text = tokens[i - 1].text + " " + tok.text
            else:
                text = tok.text

        else:
            text = tok.text

        # replaceRockDots
        text = re.sub(r'[ß]', "ss", text)
        text = re.sub(r'[ö]', "oe", text)
        text = re.sub(r'[ü]', "ue", text)
        text = re.sub(r'[ä]', "ae", text)

        if text not in term_dict_w_stop.keys():
            term_dict_w_stop[text] = 1
        else:
            term_dict_w_stop[text] += 1

        if text.lower() not in DE_STOP_WORDS:
            if text not in term_dict_wo_stop.keys():
                term_dict_wo_stop[text] = 1
            else:
                term_dict_wo_stop[text] += 1


term_dict_sorted = sort_dictionary(term_dict_w_stop)
term_dict_wo_sorted = sort_dictionary(term_dict_wo_stop)

split_value = 0.2
from_ = int((1-split_value) * float(len(term_dict_sorted)))  # 1-split
to_ = len(term_dict_sorted)

#logprint(term_dict_sorted[from_: to_])
#logprint("\n")
#logprint(term_dict_wo_sorted[from_: to_])

for elem in term_dict_sorted:
    logprint(elem)

logprint("\n")
logprint("\n")
logprint("\n")
logprint("\n")

for elem in term_dict_wo_sorted:
    logprint(elem)

"""

in_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/log/terms_without_stop.txt"
out_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/log/terms_without_stopwords.txt"

gen = reversed(list(open(in_path)))

textacy.fileio.write_file_lines(gen, out_path)
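The manual dict counting above could also be written with collections.Counter; a sketch with made-up tokens, where cleaned_terms and de_stop_words stand in for the values built in the loop.

from collections import Counter

cleaned_terms = ["drucker", "vpn", "drucker", "und", "netzwerk", "vpn", "drucker"]
de_stop_words = {"und", "der", "die", "das"}

term_counts_w_stop = Counter(cleaned_terms)
term_counts_wo_stop = Counter(t for t in cleaned_terms if t not in de_stop_words)

# most_common() returns (term, count) pairs sorted by frequency, descending,
# so the separate sort_dictionary() step would not be needed.
print(term_counts_w_stop.most_common())   # [('drucker', 3), ('vpn', 2), ('und', 1), ('netzwerk', 1)]
print(term_counts_wo_stop.most_common())  # [('drucker', 3), ('vpn', 2), ('netzwerk', 1)]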
@@ -149,27 +292,6 @@ print("\n\n\nTime Elapsed Test:{0}\n\n".format(end - start))

"""