diff --git a/java_LabledLDA/models/tickets/tickets.gz b/java_LabledLDA/models/tickets/tickets.gz
index fd68e5b..e21f988 100644
Binary files a/java_LabledLDA/models/tickets/tickets.gz and b/java_LabledLDA/models/tickets/tickets.gz differ
diff --git a/testo.py b/testo.py
index b3ff86c..46849ef 100644
--- a/testo.py
+++ b/testo.py
@@ -7,8 +7,8 @@
 print(datetime.now())
 
 #path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv"
-path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
-#path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"
+#path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
+path2csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"
 
 path_csv_split = path2csv.split("/")
 print(path_csv_split[len(path_csv_split)-1])
@@ -501,7 +501,7 @@ def build_thesaurus(path2lexicalentries, path2synsets):
 
 THESAURUS=[]
-#THESAURUS=build_thesaurus(path2lexicalentries=lexicalentries,path2synsets=synsets) #todo enable
+THESAURUS=build_thesaurus(path2lexicalentries=lexicalentries,path2synsets=synsets)
 
 def getFirstSynonym(word, thesaurus=THESAURUS):
     if not isinstance(word, str):
@@ -541,7 +541,7 @@ def words(text): return re.findall(r'\w+', text.lower())
 
 WORDS={}
-#WORDS = Counter(words(open(path2words).read())) #todo enable
+WORDS = Counter(words(open(path2words).read()))
 
 def P(word, N=sum(WORDS.values())):
     "Probability of `word`."
@@ -629,10 +629,10 @@ def stringcleaning(stringstream):
         string = " ".join([lemmatizeWord(word) for word in string.split()])
 
         # normalize synonyms  # idea: before or after lemmatizing?
-        #string = " ".join([getFirstSynonym(word) for word in string.split()])
+        string = " ".join([getFirstSynonym(word) for word in string.split()])
 
         # autocorrect
-        #string = " ".join([autocorrectWord(word) for word in string.split()])
+        string = " ".join([autocorrectWord(word) for word in string.split()])
 
         yield string
@@ -745,7 +745,7 @@ custom_words=["geehrt","dame","herr","hilfe","problem","lauten","bedanken","vora
                "auffahren","vorgang","hinweis","institut","universitaet","name","gruss","id","erfolg","mail","folge",
                "nummer","team","fakultaet","email","absender","tu","versenden","vorname","message",
                "service","strasse","prozess","portal","raum","personal","moeglichkeit","fremd","wende","rueckfrage", "stehen", "verfuegung"
-               "funktionieren","kollege", "pruefen"
+               ,"funktionieren","kollege", "pruefen","hoffen"  # note: the leading comma keeps "verfuegung" and "funktionieren" from being silently concatenated
                ]
@@ -896,9 +896,15 @@
 weighting = 'tf'
 # weighting ='tfidf'
 
 named_entities = False
 
-#printvecotorization(ngrams=ngrams,min_df=min_df,max_df=max_df,weighting=weighting,named_entities=named_entities)
+printvecotorization(ngrams=1,min_df=1,max_df=1.0,weighting=weighting)
+printvecotorization(ngrams=1,min_df=1,max_df=0.5,weighting=weighting)
+printvecotorization(ngrams=1,min_df=1,max_df=0.8,weighting=weighting)
+printvecotorization(ngrams=(1,2),min_df=1,max_df=1.0,weighting=weighting)
+printvecotorization(ngrams=(1,2),min_df=1,max_df=0.5,weighting=weighting)
+printvecotorization(ngrams=(1,2),min_df=1,max_df=0.8,weighting=weighting)
+
 
 """
diff --git a/testra.py b/testra.py
index 0be8eac..5b81579 100644
--- a/testra.py
+++ b/testra.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 import re
 import time
+import json
 
 import spacy
 import textacy
@@ -16,7 +17,116 @@ import xml.etree.ElementTree as ET
 
 print(datetime.now())
 
-#PARSER=spacy.load("de")
+PARSER=spacy.load("de")
+
+
+corpus = textacy.Corpus(PARSER)
+
+testcontetn = [
+    "fdsfdsfsd",
+    "juzdtjlkö",
+    "gfadojplk"
+]
+
+testmetda = [
+    {"categoryName":"zhb","Solution":"","Subject":"schulungstest"},
+    {"categoryName":"neuanschluss","Solution":"subject","Subject":"telephone contract"},
+    {"categoryName":"zhb","Solution":"","Subject":"setuji"}
+]
+
+
+def makecontent(testcontetn):
+    for content in testcontetn:
+        yield content
+
+
+def makemeta(testmetda):
+    for metdata in testmetda:
+        yield metdata
+
+
+corpus.add_texts(
+    makecontent(testcontetn),
+    makemeta(testmetda)
+)
+
+print(corpus)
+
+
+corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
+corpus_name = "testcorpus"
+
+"""
+#corpus.save(corpus_path, name=corpus_name, compression=corpus_compression)
+#corpus = textacy.Corpus.load(corpus_path, name=corpus_name, compression=corpus_compression)
+
+
+import pathlib
+
+strings_path = pathlib.Path(corpus_path + 'strings.json')
+path_lexemes_bin_ = pathlib.Path(corpus_path + 'lexemes.bin')
+
+PARSER.vocab.dump(path_lexemes_bin_)
+nlp.vocab.load_lexemes(path_lexemes_bin_)
+"""
+
+def save_corpus(corpus_path, corpus_name):
+
+    # save stringstore
+    stringstore_path = corpus_path + corpus_name + '_strings.json'
+    with open(stringstore_path, "w") as file:
+        PARSER.vocab.strings.dump(file)
+
+    # save content
+    contentpath = corpus_path + corpus_name + "_content.bin"
+    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath)
+
+    # save meta
+    metapath = corpus_path + corpus_name + "_meta.json"
+    textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)
+
+
+def load_corpus(corpus_path, corpus_name):
+    # load a fresh language pipeline
+    nlp = spacy.load("de")
+
+    # load stringstore
+    stringstore_path = corpus_path + corpus_name + '_strings.json'
+    with open(stringstore_path, "r") as file:
+        nlp.vocab.strings.load(file)
+
+    # create the corpus
+    corpus = textacy.Corpus(nlp)
+
+    # load meta
+    metapath = corpus_path + corpus_name + "_meta.json"
+    metadata_stream = textacy.fileio.read_json_lines(metapath)
+
+    # load content
+    contentpath = corpus_path + corpus_name + "_content.bin"
+    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
+
+    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
+        corpus.add_doc(
+            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
+
+    return corpus
+
+
+save_corpus(corpus_path, corpus_name)
+
+print(load_corpus(corpus_path, corpus_name))
+
+
+# idea: do the same for the spellchecker, lemmatizer and thesaurus to save memory
+# todo: re-create the generators on every run? they are consumed one after another at runtime; otherwise the result is not deterministic
 
 
 """
@@ -97,7 +207,7 @@ for r in root:
 
 
 
-
+"""
 import re
 from collections import Counter
 
@@ -135,7 +245,7 @@ def edits2(word):
     "All edits that are two edits away from `word`."
     return (e2 for e1 in edits1(word) for e2 in edits1(e1))
 
-
+"""
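
The TODO that closes the new testra.py block asks whether the generators need to be re-created on every run. The answer is yes: a generator object can only be consumed once. Below is a minimal, dependency-free sketch of that behavior; the makecontent helper mirrors the factory added above, and the names and data are illustrative only, not part of the patch.

def makecontent(texts):
    # generator factory, mirroring the one added in testra.py
    for content in texts:
        yield content

texts = ["a", "b", "c"]

gen = makecontent(texts)
print(list(gen))  # ['a', 'b', 'c']
print(list(gen))  # [] -- the same generator object is already exhausted

# Calling the factory again yields a fresh generator, which is why
# corpus.add_texts() above is fed new makecontent(...)/makemeta(...) calls
# rather than generator objects stored from an earlier pass.
print(list(makecontent(texts)))  # ['a', 'b', 'c']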