diff --git a/cleaning.py b/cleaning.py
new file mode 100644
index 0000000..55fd812
--- /dev/null
+++ b/cleaning.py
@@ -0,0 +1,242 @@
+# -*- coding: utf-8 -*-
+
+from datetime import datetime
+import csv
+import sys
+import time
+import os
+import re
+
+import textacy
+
+from miscellaneous import *
+
+csv.field_size_limit(sys.maxsize)
+FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
+
+
+# load config
+config_ini = FILEPATH + "config.ini"
+
+config = ConfigParser.ConfigParser()
+with open(config_ini) as f:
+    config.read_file(f)
+
+
+REGEX_SPECIALCHAR = r'[`\-=~%^&*()_+\[\]{};\'\\:"|]'  # open question: also split on ,.?!
+
+# frequency dict for the spellchecker; filled in main() from the pickle built by init.py
+WORDS = {}
+
+
+########################## Spellchecking ##########################################
+# http://norvig.com/spell-correct.html
+# http://wortschatz.uni-leipzig.de/en/download
+
+
+def words(text): return re.findall(r'\w+', text.lower())
+
+
+def P(word):
+    "Probability of `word`."
+    # N is computed per call: WORDS is empty at import time and only loaded
+    # in main(), so a default argument would freeze N at 0.
+    N = sum(WORDS.values())
+    return WORDS[word] / N
+
+
+def correction(word):
+    "Most probable spelling correction for word."
+    return max(candidates(word), key=P)
+
+
+def candidates(word):
+    "Generate possible spelling corrections for word."
+    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])
+
+
+def known(words):
+    "The subset of `words` that appear in the dictionary WORDS."
+    return set(w for w in words if w in WORDS)
+
+
+def edits1(word):
+    "All edits that are one edit away from `word`."
+    letters = 'abcdefghijklmnopqrstuvwxyz'
+    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
+    deletes = [L + R[1:] for L, R in splits if R]
+    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
+    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
+    inserts = [L + c + R for L, R in splits for c in letters]
+    return set(deletes + transposes + replaces + inserts)
+
+
+def edits2(word):
+    "All edits that are two edits away from `word`."
+    return (e2 for e1 in edits1(word) for e2 in edits1(e1))
+
+
+def autocorrectWord(word):
+    try:
+        return correction(word)
+    except Exception:
+        # fall back to the original word if no correction can be computed
+        return word
+
+
+############# string cleaning
+
+
+def clean(stringstream, autocorrect=False):
+
+    for string in stringstream:
+        # fix unicode
+        string = textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC')
+
+        # separate words on special characters (open question: also on ,.?!)
+        string = " ".join(re.compile(REGEX_SPECIALCHAR).split(string))
+
+        # normalize whitespace
+        string = textacy.preprocess.normalize_whitespace(string)
+
+        # remove linebreaks
+        string = re.sub(r'[\n]', " ", string)
+
+        # transliterate umlauts ("replaceRockDots")
+        string = re.sub(r'[ß]', "ss", string)
+        string = re.sub(r'[ö]', "oe", string)
+        string = re.sub(r'[ü]', "ue", string)
+        string = re.sub(r'[ä]', "ae", string)
+
+        # open question: enable autocorrect by default?
+        # idea: http://lexitron.nectec.or.th/public/COLING-2010_Beijing_China/POSTERS/pdf/POSTERS022.pdf
+        if autocorrect:
+            string = " ".join([autocorrectWord(word) for word in string.split()])
+
+        yield string
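+
+# Illustrative usage sketch (assumes a toy WORDS dict; the real one is loaded
+# from the pickle file in main()):
+#
+#   >>> WORDS.update({"support": 80, "portal": 50})
+#   >>> correction("suport")
+#   'support'
+#   >>> list(clean(["Grüße vom Portal!"]))
+#   ['gruesse vom portal!']
+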
+
+def processDictstream(dictstream, funcdict, parser):
+    """
+    :param dictstream: dict-gen
+    :param funcdict:
+                clean_in_meta = {
+                "Solution":funclist,
+                ...
+                }
+
+    :param parser: spacy-parser
+    :return: dict-gen
+    """
+    for dic in dictstream:
+        result = {}
+        for key, value in dic.items():
+
+            if key in funcdict:
+
+                doc = parser(value)
+                tokens = [tok for tok in doc]
+                funclist = funcdict[key]
+
+                tokens = filterTokens(tokens, funclist)
+
+                result[key] = " ".join([tok.lower_ for tok in tokens])
+
+            else:
+                result[key] = value
+        yield result
+
+
+def filterTokens(tokens, funclist):
+    # in: token list, function list
+    # out: token list
+    for f in funclist:
+        tokens = list(filter(f, tokens))
+
+    return tokens
+
+
+def removePOS(pos_list):
+    return lambda tok: tok.pos_ not in pos_list
+
+##################################################################################################
+
+
+path2wordsdict = FILEPATH + config.get("spellchecking", "pickle_file")
+
+corpus_de_path = FILEPATH + config.get("de_corpus", "path")
+
+corpus_en_path = FILEPATH + config.get("en_corpus", "path")
+
+
+def cleanCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrandom=10):
+
+    logprint("Clean {0}_corpus at {1}".format(lang, datetime.now()))
+
+    rawCorpus_name = lang + "_raw_ticket"
+    cleanCorpus_name = lang + "_clean_ticket"
+
+    # load raw corpus and create new one
+    raw_corpus, parser = load_corpus(corpus_name=rawCorpus_name, corpus_path=corpus_path)
+
+    clean_corpus = textacy.Corpus(parser)
+
+    ## process and add files to the textacy corpus
+    clean_corpus.add_texts(
+        clean(corpus2Text(raw_corpus)),
+        processDictstream(corpus2Meta(raw_corpus), clean_in_meta, parser=parser)
+    )
+
+    # drop empty docs from the corpus
+    clean_corpus.remove(lambda doc: len(doc) == 0)
+
+    for i in range(printrandom):
+        printRandomDoc(clean_corpus)
+
+    # save corpus
+    save_corpus(corpus=clean_corpus, corpus_path=corpus_path, corpus_name=cleanCorpus_name)
+
+    return clean_corpus
+
+
+def main():
+    start = time.time()
+
+    # make the frequency dict visible to the spellchecking functions above;
+    # without the global statement this assignment would only bind a local name
+    global WORDS
+    WORDS = load_obj(path2wordsdict)
+
+    clean_in_content = []  # open question: needed at all?
+
+    clean_in_meta = {
+        "Solution": [removePOS(["SPACE"])],
+        "Subject": [removePOS(["SPACE", "PUNCT"])],
+        "categoryName": [removePOS(["SPACE", "PUNCT"])]
+    }
+
+    corpus = cleanCorpus(corpus_de_path, clean_in_content, clean_in_meta, "de", printrandom=5)
+
+    end = time.time()
+    logprint("Time Elapsed Cleaning: {0} min".format((end - start) / 60))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cleaning_bsp.txt b/cleaning_bsp.txt
new file mode 100644
index 0000000..2edcd64
--- /dev/null
+++ b/cleaning_bsp.txt
@@ -0,0 +1,24 @@
+Index: 0
+    Text: lieber support, ich habe gerade versucht mich mit meiner unicard im firefox browser fuer das service portal zu authentifizieren. das hat vor einigen wochen noch tadelos geklappt und mittlerweile bekomme ich folgende fehlermeldung ich hoffe sie koennen mir weiterhelfen. vielen dank und viele gruesse sascha feldhorst dipl. inform. sascha feldhorst wiss. ang. technische universitaet dortmund maschinenbau lehrstuhl fuer foerder und lagerwesen logistikcampus joseph von fraunhofer str. 2 4 d 44227 dortmund tel. 49 231 755 40 73 fax 49 231 755 47 68 mailto sascha.feldhorst@tu dortmund.de sascha.feldhorst@tu dortmund.de http www.flw.mb.tu dortmund.de www.flw.mb.tu dortmund.de wichtiger hinweis die information in dieser e mail ist vertraulich. sie ist ausschliesslich fuer den adressaten bestimmt. sollten sie nicht der fuer diese e mail bestimmte adressat sein, unterrichten sie bitte den absender und vernichten sie diese mail. vielen dank. 
unbeschadet der korrespondenz per e mail, sind unsere erklaerungen ausschliesslich final rechtsverbindlich, wenn sie in herkoemmlicher schriftform mit eigenhaendiger unterschrift oder durch uebermittlung eines solchen schriftstuecks per telefax erfolgen. important note the information included in this e mail is confidential. it is solely intended for the recipient. if you are not the intended recipient of this e mail please contact the sender and delete this message. thank you. without prejudice of e mail correspondence, our statements are only legally binding when they are made in the conventional written form with personal signature or when such documents are sent by fax. + categoryName: betrieb + +Index: 0 + Text: support browser service portal mittlerweile + categoryName: betrieb + + + + + + + + + + + Index: 1 + Text: telefon umzug antragsteller astrid gramm astrid.gramm@tu dortmund.de terminvorschlag 14.08.2015 einrichtung dezernat 2 abteilung 2.5 psp element uniaccount mnichofm hofmann, nicole gebaeude dezernat 5 raum id 201 651430 telefondose neztwerkdose dt04.5 04.6 telefonnr. 4821 eintrag telefonbuch e mail astrid.gramm@tu dortmund.de voicemail ansprechpartner astrid gramm tel. ansprechpartner 5444 verantwortlicher nutzer type bemerkung frau hofmann wird am 14.08.2015 in die wd 2 umziehen. es ist der raum 201a im og nicht 201 eine bezeichnung der telefondose ist nicht vorhanden. + categoryName: elektronisches telefonbuch + + Index: 1 + Text: telefon umzug antragsteller gramm einrichtung dezernat abteilung element gebaeude dezernat raum id eintrag telefonbuch mail ansprechpartner gramm ansprechpartner verantwortlicher nutzer type bemerkung raum bezeichnung + categoryName: elektronisches telefonbuch \ No newline at end of file diff --git a/config.ini b/config.ini index 5e99a06..39dc5d6 100644 --- a/config.ini +++ b/config.ini @@ -41,10 +41,8 @@ filename=topicModelTickets.log [de_corpus] -#input=M42-Export/Tickets_med.csv -#input=M42-Export/Tickets_small.csv -#input=M42-Export/Tickets_mini.csv -input=M42-Export/de_tickets.csv +input=M42-Export/Tickets_small.csv +#input=M42-Export/de_tickets.csv path=corpi/ diff --git a/corporization.py b/corporization.py index 64e4c47..9b7c837 100644 --- a/corporization.py +++ b/corporization.py @@ -97,6 +97,7 @@ metaliste = [ content_collumn_name = config.get("tickets","content_collumn_name") metaliste = list(map(normalize_whitespace,config.get("tickets","metaliste").split(","))) + path2de_csv = FILEPATH + config.get("de_corpus","input") corpus_de_path = FILEPATH + config.get("de_corpus", "path") @@ -114,7 +115,7 @@ def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, la path_csv_split = path2_csv.split("/") filename = path_csv_split[len(path_csv_split) - 1] - printlog("Corporization of {0} at {1}".format(filename,datetime.now())) + logprint("Corporization of {0} at {1}".format(filename, datetime.now())) raw_corpus = textacy.Corpus(lang) @@ -139,7 +140,7 @@ def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, la # save corpus raw_name = lang + "_raw_ticket" save_corpus(corpus=raw_corpus, corpus_path=corpus_path, corpus_name=raw_name) - printlog("Done") + logprint("Done") def main(): @@ -152,7 +153,7 @@ def main(): end = time.time() - printlog("Time Elapsed Corporization:{0} min".format((end - start) / 60)) + logprint("Time Elapsed Corporization:{0} min".format((end - start) / 60)) if __name__ == "__main__": diff --git a/init.py b/init.py index 71c28b2..4a23069 100644 --- a/init.py +++ b/init.py @@ 
-272,47 +272,47 @@ path2stopwordlist_en = FILEPATH + config.get("en_stopwords","pickle_file") def main(): start = time.time() - printlog("Init: {0}".format(datetime.now())) + logprint("Init: {0}".format(datetime.now())) - printlog("create and save lemma_dict") + logprint("create and save lemma_dict") lemma_dict = create_lemma_dict(path2lemma_file) save_obj(lemma_dict, path2lemmadict) - printlog("Build and save Wordlist for Spellchecking") + logprint("Build and save Wordlist for Spellchecking") words = build_words_for_spellchecking(path2words_file) save_obj(words, path2wordlist) - printlog("Build and save Thesaurus") + logprint("Build and save Thesaurus") thesaurus = build_thesaurus_dict(path2wordnet) save_obj(thesaurus, path2thesaurus_dict) - printlog("Build and save stoppwortliste") + logprint("Build and save stoppwortliste") de_stop_words, en_stop_words = create_stopword_lists(stop1, stop2, stop3) save_obj(de_stop_words, path2stopwordlist_de) save_obj(en_stop_words, path2stopwordlist_en) - printlog("Build and save nomenliste") + logprint("Build and save nomenliste") nouns = list_from_files(nouns1,nouns2) save_obj(nouns, path2nouns_list) - printlog("Build and save firstnameslist") + logprint("Build and save firstnameslist") vornamen = list_from_files(firstnames_txt) save_obj(vornamen, path2firstnameslist) end = time.time() - printlog("Time Elapsed Initialization:{0} min".format((end - start) / 60)) + logprint("Time Elapsed Initialization:{0} min".format((end - start) / 60)) diff --git a/java_LabledLDA/models/tickets/.others.gz b/java_LabledLDA/models/tickets/.others.gz index 48c3c1b..d52733e 100644 Binary files a/java_LabledLDA/models/tickets/.others.gz and b/java_LabledLDA/models/tickets/.others.gz differ diff --git a/java_LabledLDA/models/tickets/.tassign.gz b/java_LabledLDA/models/tickets/.tassign.gz index f815b2d..92186a2 100644 Binary files a/java_LabledLDA/models/tickets/.tassign.gz and b/java_LabledLDA/models/tickets/.tassign.gz differ diff --git a/java_LabledLDA/models/tickets/.theta.gz b/java_LabledLDA/models/tickets/.theta.gz index 8d1f466..3a4ccbe 100644 Binary files a/java_LabledLDA/models/tickets/.theta.gz and b/java_LabledLDA/models/tickets/.theta.gz differ diff --git a/java_LabledLDA/models/tickets/.twords.gz b/java_LabledLDA/models/tickets/.twords.gz index df07efe..80d115c 100644 Binary files a/java_LabledLDA/models/tickets/.twords.gz and b/java_LabledLDA/models/tickets/.twords.gz differ diff --git a/java_LabledLDA/models/tickets/.wordmap.gz b/java_LabledLDA/models/tickets/.wordmap.gz index 6051dd9..c05f690 100644 Binary files a/java_LabledLDA/models/tickets/.wordmap.gz and b/java_LabledLDA/models/tickets/.wordmap.gz differ diff --git a/java_LabledLDA/models/tickets/tickets.gz b/java_LabledLDA/models/tickets/tickets.gz index 0516f9a..3a08de8 100644 Binary files a/java_LabledLDA/models/tickets/tickets.gz and b/java_LabledLDA/models/tickets/tickets.gz differ diff --git a/main.py b/main.py index 0cdd6ca..d232bd8 100644 --- a/main.py +++ b/main.py @@ -1,29 +1,35 @@ # -*- coding: utf-8 -*- import time -import init import corporization import preprocessing import topicModeling +import cleaning from miscellaneous import * # ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_main.log &" start = time.time() +import init init.main() -printlog("") +logprint("") corporization.main() -printlog("") +logprint("") + +cleaning.main() +logprint("") 
 preprocessing.main()
-printlog("")
+logprint("")
 
-topicModeling.main()
-printlog("")
+topicModeling.main(use_raw=False)
+logprint("")
 
+topicModeling.main(use_raw=True)
+logprint("")
 
 end = time.time()
-printlog("Total Time Elapsed: {0} min".format((end - start) / 60))
+logprint("Total Time Elapsed: {0} min".format((end - start) / 60))
diff --git a/miscellaneous.py b/miscellaneous.py
index d1a3fa6..ab4357f 100644
--- a/miscellaneous.py
+++ b/miscellaneous.py
@@ -12,6 +12,12 @@ import spacy
 import textacy
 from scipy import *
 import os
+import glob
+import json
+import functools
+import warnings
+from textacy.fileio import open_sesame
+from spacy.tokens.doc import Doc as SpacyDoc
 
 csv.field_size_limit(sys.maxsize)
 FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
@@ -40,7 +44,7 @@ logging.basicConfig(filename=filename, level=level)
 
-def printlog(string, level="INFO"):
+def logprint(string, level="INFO"):
     """log and prints"""
     print(string)
     if level == "INFO":
@@ -91,6 +95,7 @@ def load_obj(path):
     with open(path, 'rb') as f:
         return pickle.load(f)
 
+
 def replaceRockDots():
     return lambda string: re.sub(r'[ß]', "ss", (re.sub(r'[ö]', "oe",
@@ -117,7 +122,19 @@ def list_from_files(*paths):
     return list(map(textacy.preprocess.normalize_whitespace, liste))
 
 
+def deprecated(func):
+    """This is a decorator which can be used to mark functions
+    as deprecated. It will result in a warning being emitted
+    when the function is used."""
+    # uses functools and warnings, imported at the top of this file
+    @functools.wraps(func)
+    def new_func(*args, **kwargs):
+        warnings.simplefilter('always', DeprecationWarning)  # turn off filter
+        warnings.warn("Call to deprecated function {}.".format(func.__name__), category=DeprecationWarning, stacklevel=2)
+        warnings.simplefilter('default', DeprecationWarning)  # reset filter
+        return func(*args, **kwargs)
+
+    return new_func
 
 
 def printRandomDoc(textacyCorpus):
@@ -127,17 +144,26 @@ def printRandomDoc(textacyCorpus):
     """
     print()
     if len(textacyCorpus) == 0:
-        printlog("NO DOCS IN CORPUS")
+        logprint("NO DOCS IN CORPUS")
     else:
-        printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
+        # logprint("len(textacyCorpus) = %i" % len(textacyCorpus))
         randIndex = int((len(textacyCorpus) - 1) * random.random())
-        printlog("Index: {0} \n Text: {1} \n Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
-                                                                     textacyCorpus[randIndex].metadata['categoryName']))
+        logprint("Index: {0} \n Text: {1} \n categoryName: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
+                                                                         textacyCorpus[randIndex].metadata['categoryName']))
 
     print()
 
 
+def corpus2Text(corpus):
+    for doc in corpus:
+        yield doc.text
+
+
+def corpus2Meta(corpus):
+    for doc in corpus:
+        yield doc.metadata
+
+
+def saveplaincorpustext(corpus, path):
+    textacy.fileio.write_file_lines(corpus2Text(corpus), filepath=path)
@@ -163,10 +189,16 @@ def save_corpus(corpus, corpus_path, corpus_name):
     parserpath = corpus_path + str(parser.lang) + '_parser'
     parser.save_to_directory(parserpath)
 
+    # save content
     contentpath = corpus_path + corpus_name + "_content.bin"
     textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath)
 
+    # save plain content
+    plainpath = corpus_path + corpus_name + "_content.json"
+    textacy.fileio.write_json_lines(({"index": doc.corpus_index, "content": doc.text} for doc in corpus), plainpath)
+
     # save meta
     metapath = corpus_path + corpus_name + "_meta.json"
     textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)
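+
+# Usage sketch (illustrative, assuming a previously saved corpus named
+# "de_raw_ticket" under corpi/):
+#
+#   corpus, parser = load_corpus(corpus_path="corpi/", corpus_name="de_raw_ticket")
+#   save_corpus(corpus, corpus_path="corpi/", corpus_name="de_raw_ticket")
+#
+# save_corpus() writes three artifacts: the serialized spacy docs
+# (*_content.bin), a plain-text JSON dump (*_content.json) and the metadata
+# (*_meta.json). load_corpus() below prefers the binary form and falls back
+# to the JSON dump if deserialization fails.
+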
@@ -175,6 +207,7 @@
 
 
+
 def load_corpus(corpus_path, corpus_name, lang="de"):
     """
     Load textacy-Corpus including spacy-parser out from file
@@ -207,16 +240,115 @@ def load_corpus(corpus_path, corpus_name, lang="de"):
 
     contentpath = corpus_path + corpus_name + "_content.bin"
+    plainpath = corpus_path + corpus_name + "_content.json"
     metapath = corpus_path + corpus_name + "_meta.json"
 
-    metadata_stream = textacy.fileio.read_json_lines(metapath)
-    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
-    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
-        corpus.add_doc(
-            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
+    try:
+        spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
+        metadata_stream = textacy.fileio.read_json_lines(metapath)
+
+        for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
+            corpus.add_doc(
+                textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
+    except Exception:
+        # binary content could not be read: re-initialize the corpus
+        # from the plain-text JSON dump instead
+        corpus = textacy.Corpus(parser)
+
+        plain_stream = textacy.fileio.read_json_lines(plainpath)  # yields {"index": int, "content": str}
+        metadata_stream = textacy.fileio.read_json_lines(metapath)
+
+        for plain, metadata in zip(plain_stream, metadata_stream):
+            corpus.add_doc(
+                textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=metadata))
+
+    return corpus, corpus.spacy_lang
+
+
+def save_corpusV2(corpus, corpus_path, corpus_name):
+    """
+    saves a textacy-corpus including spacy-parser
+    :param corpus: textacy-Corpus
+    :param corpus_path: str
+    :param corpus_name: str (should contain the language like "_de_")
+    """
+
+    # save parser
+    parser = corpus.spacy_lang
+    parserpath = corpus_path + str(parser.lang) + '_parser'
+    parser.save_to_directory(parserpath)
+
+    contentpath = corpus_path + corpus_name + "_docs/"
+    if not os.path.exists(contentpath):
+        os.makedirs(contentpath)
+
+    for doc in corpus:
+        # to_bytes() returns raw bytes, so the doc dump is opened in binary
+        # mode; the metadata is plain JSON text
+        with open(contentpath + str(doc.corpus_index) + "_doc.bin", 'wb') as f:
+            f.write(doc.spacy_doc.to_bytes())
+        with open(contentpath + str(doc.corpus_index) + "_meta.json", 'w') as file:
+            file.write(json.dumps(doc.metadata))
+
+
+def load_corpusV2(corpus_path, corpus_name, lang="de"):
+    """
+    Load textacy-Corpus including spacy-parser out from file
+    :param corpus_path: str
+    :param corpus_name: str (should contain the language like "_de_")
+    :param lang: str (language code) or spacy.Language
+    :return: textacy.Corpus, spacy.language
+    """
+
+    # check for language
+    if "de_" in corpus_name:
+        lang = "de"
+    elif "en_" in corpus_name:
+        lang = "en"
+
+    # load parser
+    parser = spacy.load(lang)
+
+    stringstorepath = corpus_path + str(lang) + '_parser' + '/vocab/strings.json'
+    with open(stringstorepath) as file:
+        parser.vocab.strings.load(file)
+
+    vocabpath = Path(corpus_path + str(lang) + '_parser' + '/vocab/lexemes.bin')
+    parser.vocab.load_lexemes(vocabpath)
+
+    # load corpus
+    corpus = textacy.Corpus(parser)
+
+    contentpath = corpus_path + corpus_name + "_docs/"
+    docs = yield_fromdir(contentpath, spacy_vocab=corpus.spacy_vocab, type="doc")
+    metas = yield_fromdir(contentpath, type="meta")
+
+    for doc, meta in zip(docs, metas):
+        corpus.add_doc(
+            textacy.Doc(doc, lang=corpus.spacy_lang, metadata=meta))
+
+    return corpus, corpus.spacy_lang
+
+
+def yield_fromdir(path, spacy_vocab=None, type=".pkl"):
+    os.chdir(path)
+    filelist = [name for name in os.listdir('.') if os.path.isfile(name)]
+    filelist = [filename for filename in filelist if type in filename]
+    filelist.sort(key=lambda elem: elem.split("_")[0])
+
+    if type == 'doc':
+        for filename in filelist:
+            # spacy doc dumps are binary, hence 'rb'
+            with open(path + filename, 'rb') as f:
+                for bytes_string in SpacyDoc.read_bytes(f):
+                    yield SpacyDoc(spacy_vocab).from_bytes(bytes_string)
+    elif 
type == 'meta': + for filename in filelist: + with open(path+filename,'r') as f: + yield json.load(f) + else: + for filename in filelist: + yield load_obj(path+filename) + + diff --git a/nomen.txt b/nomen.txt index 3793ade..e3c6433 100644 --- a/nomen.txt +++ b/nomen.txt @@ -1,3 +1,5 @@ +unicard +uniaccount kernspaltung kernfission atomspaltung diff --git a/preprocessing.py b/preprocessing.py index 26e755e..77232ee 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -27,12 +27,6 @@ with open(config_ini) as f: global REGEX_SPECIALCHAR global REGEX_TOPLVL -REGEX_SPECIALCHAR = r'[`\-=~%^&*()_+\[\]{};\'\\:"|]' -REGEX_TOPLVL = r'\.[a-z]{2,3}(\.[a-z]{2,3})?' - - - - global THESAURUS global WORDS global LEMMAS @@ -41,6 +35,10 @@ global VORNAMEN global DE_STOP_WORDS global EN_STOP_WORDS +REGEX_SPECIALCHAR = r'[`\-=~%^&*()_+\[\]{};\'\\:"|]' #+r',.' +REGEX_TOPLVL = r'\.[a-z]{2,3}(\.[a-z]{2,3})?' + + THESAURUS = {} WORDS= {} LEMMAS= {} @@ -185,7 +183,7 @@ def autocorrectWord(word): ############# stringcleaning - +@deprecated def stringcleaning(stringstream): @@ -225,7 +223,6 @@ def stringcleaning(stringstream): - def filterTokens(tokens, funclist): # in:tokenlist, funclist # out: tokenlist @@ -257,20 +254,6 @@ def processContentstream2(textstream, parser, token_filterlist=None): def preparse(stringstream): for string in stringstream: - # fixUnicode - string = textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC') - - # seperate_words_on_regex: - string = " ".join(re.compile(REGEX_SPECIALCHAR).split(string)) - - #normalize whitespace - string = textacy.preprocess.normalize_whitespace(string) - - # replaceRockDots - string = re.sub(r'[ß]', "ss", string) - string = re.sub(r'[ö]', "oe", string) - string = re.sub(r'[ü]', "ue", string) - string = re.sub(r'[ä]', "ae", string) # cut_after # todo addressen enfernen --> postal.parser idee zu metadaten hinzufügen @@ -312,6 +295,7 @@ def corpus2Meta(corpus): for doc in corpus: yield doc.metadata +@deprecated def processContentstream(textstream, parser, token_filterlist=None): """ :param textstream: string-gen @@ -398,21 +382,22 @@ corpus_en_path = FILEPATH + config.get("en_corpus", "path") def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrandom=10): - printlog("Preprocess {0}_corpus at {1}".format(lang,datetime.now())) + logprint("Preprocess {0}_corpus at {1}".format(lang, datetime.now())) - rawCorpus_name = lang + "_raw_ticket" + cleanCorpus_name = lang + "_clean_ticket" preCorpus_name = lang + "_pre_ticket" + logprint("Load {0}_raw".format(lang)) #load raw corpus and create new one - raw_corpus, parser = load_corpus(corpus_name=rawCorpus_name, corpus_path=corpus_path) + clean_corpus, parser = load_corpus(corpus_name=cleanCorpus_name, corpus_path=corpus_path) corpus = textacy.Corpus(parser) ## process and add files to textacy-corpi, corpus.add_texts( - processContentstream2(corpus2Text(raw_corpus), token_filterlist=filter_tokens, parser=parser), - processDictstream(corpus2Meta(raw_corpus), clean_in_meta,parser=parser) + processContentstream2(corpus2Text(clean_corpus), token_filterlist=filter_tokens, parser=parser), + processDictstream(corpus2Meta(clean_corpus), clean_in_meta,parser=parser) ) @@ -429,6 +414,16 @@ def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", print save_corpus(corpus=corpus, corpus_path=corpus_path, corpus_name=preCorpus_name) + #save corpus as labled, plain text + plainpath = FILEPATH + config.get("de_corpus", "path") + "labled_lines.txt" + 
textacy.fileio.write_file_lines(labledCorpiLines(corpus),filepath=plainpath ) + + return corpus + +def labledCorpiLines(corpus): + for doc in corpus: + # generate [topic1, topic2....] tok1 tok2 tok3 out of corpi + yield "[" + doc.metadata["categoryName"] + "] " + doc.text def main(): @@ -468,12 +463,16 @@ def main(): "categoryName": [removePOS(["SPACE", "PUNCT"])] } - preprocessCorpus(corpus_de_path, filter_tokens, clean_in_meta, "de" ) + corpus = preprocessCorpus(corpus_de_path, filter_tokens, clean_in_meta, "de",printrandom=5) + + #from topicModeling import jgibbsLLDA + + #jgibbsLLDA(corpus) #preprocessCorpus(corpus_en_path, filter_tokens, clean_in_meta, "en" ) end = time.time() - printlog("Time Elapsed Preprocessing:{0} min".format((end - start) / 60)) + logprint("Time Elapsed Preprocessing:{0} min".format((end - start) / 60)) if __name__ == "__main__": main() diff --git a/testra.py b/testra.py index d1fc357..62007e9 100644 --- a/testra.py +++ b/testra.py @@ -15,11 +15,26 @@ start = time.time() import enchant from datetime import datetime - +import os import xml.etree.ElementTree as ET +FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/" +from miscellaneous import * + +# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModeling.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_topicModeling.log &" + +parser = spacy.load("de") + """ +# load config +config_ini = FILEPATH + "config.ini" + +config = ConfigParser.ConfigParser() +with open(config_ini) as f: + config.read_file(f) + + PARSER=spacy.load("de") @@ -48,13 +63,74 @@ def makemeta( testmetda): yield metdata +def corpus2Text(corpus): + for doc in corpus: + yield doc.text + corpi.add_texts( makecontent(testcontetn), makemeta(testmetda) ) - +corpus_de_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/test/" +rawCorpus_name = "de_test_ticket" print(corpi) + +#save_corpusV2(corpi,corpus_path=corpus_de_path,corpus_name=rawCorpus_name) + +#textacy.fileio.write_file_lines(corpus2Text(corpi), filepath=corpus_de_path+"plain.txt") + + +dict = {"unicard redaktionsteam": 189, "kms": 131, "itmc_st\u00f6rungen": 17, "benutzerverwaltung_probleme": 168, "mailverteiler exchange": 130, "beamer": 70, "cws_confluence": 190, "benutzerverwaltung": 26, "sos": 166, "virtuelle server": 116, "sap": 7, "wlan": 21, "lsf": 6, "gastaufenthalt": 8, "umzug": 5, "firewall betreuung": 129, "ausleihe": 39, "fiona": 10, "kursplanung": 195, "schulungsraum verwaltung": 200, "plagiatserkennung": 32, "designentwicklung": 100, "ub basis it": 184, "tsm": 51, "backup tsm": 110, "raumkalender": 174, "veeam": 149, "linux bs": 42, "hochleistungsrechnen": 90, "e learning": 37, "h\u00f6rsaal\u00fcbertragung": 52, "sophos": 88, "service portal redaktion": 182, "verkauf": 93, "fk 16": 30, "campus app": 54, "dns": 71, "kurse": 196, "itmc schulungsr\u00e4ume": 96, "leitung": 91, "telefon": 14, "housing": 135, "softwarelizenzen": 35, "hcm stammdaten": 68, "semesterticket": 197, "exchange nutzung": 33, "mediendienste": 167, "sam spider": 172, "pvp": 27, "webserver": 29, "werkvertr\u00e4ge": 158, "ibz raumbuchung": 177, "webmailer": 126, "unicard sperrung": 64, "cd dvd produktion": 114, "lizenzserver": 92, "pr\u00fcfungsmanagement": 38, "blogs wikis foren": 87, "unicard ausgabe": 161, "pools": 157, "desktop & basisdienste": 144, "antrag auf rechnungserstellung": 193, "mailalias": 121, "evaexam": 133, "neuanschluss": 0, "mobilfunkvertr\u00e4ge": 69, "ftp server": 191, "haustechnik": 
77, "raumbuchungssysteme": 186, "confluence": 181, "uniaccount zugangsdaten": 47, "itmc medienr\u00e4ume ef50": 171, "dokoll support": 128, "elektronisches telefonbuch": 3, "softwareverteilung": 153, "overhead projektor": 104, "sicherheit": 145, "itmc_als": 48, "itmc pools": 160, "zhb": 60, "serversupport": 101, "veranstaltungen": 61, "fk12 webauftritt": 138, "hardware": 142, "unicard produktion": 156, "telefonkonferenzen": 170, "dhcp": 188, "zertifikate server dfn": 139, "lan": 1, "datanet": 49, "neuausstattung": 173, "moodle": 16, "abmeldung": 13, "uni mail": 15, "medienr\u00e4ume ef50": 117, "verschiedene aufgaben": 40, "zentrale webserver": 75, "vorlesungsaufzeichnung": 152, "grafik": 132, "campus management": 72, "hacker angriff": 46, "pos": 23, "zugangsdaten": 41, "serviceportal": 63, "ews": 24, "voicemail box": 150, "service desk itmc": 74, "test": 180, "beschaffung": 57, "bestellung": 185, "vpn": 55, "app feedback": 66, "allgemein": 134, "rundmail": 105, "telefonabrechnung": 199, "limesurvey": 31, "unicard": 28, "eldorado": 140, "uniaccount": 12, "plotter": 125, "mdm mobile device management": 120, "namens\u00e4nderung": 43, "sd": 84, "basis applikationen": 103, "\u00e4nderung": 194, "fileserver einrichtung": 187, "fk14_test": 154, "werkst\u00e4tte": 179, "itmc_aufgaben": 45, "formulare antr\u00e4ge": 81, "facility": 192, "web": 169, "asknet": 136, "server storage": 113, "mail groupware": 20, "rektorat -b\u00fcro": 178, "office": 50, "werkstoffe lehrstuhl bauwesen": 59, "telefonzentrale": 115, "verwaltung": 4, "netze": 22, "beantragung": 82, "d.3 dms": 148, "redmine projektverwaltung": 141, "wsus": 106, "lido": 118, "rechnerr\u00e4ume": 143, "matrix42_hilfe": 18, "boss service desk": 44, "konteneinsicht": 62, "spam phishing": 53, "forensic": 164, "fk 12": 11, "benutzungsverwaltung": 198, "redmine": 79, "basis app": 85, "viren": 95, "fk12 migration": 155, "raumbuchung": 109, "virtuelle desktops citrix": 176, "outlook_einrichtung": 123, "kundenserver": 137, "nrw ticket": 80, "weiterentwicklung": 127, "siport zugangskontrolle": 98, "e mail dienste": 99, "vorlagenerstellung": 36, "video": 19, "studierendensekretariat": 111, "it sicherheit sic": 86, "boss": 25, "technik": 58, "dokoll pvp": 112, "betrieb": 2, "v2 campus app feedback": 151, "mailverteiler": 108, "videoschnitt": 119, "fk raumplanung 09": 9, "sap urlaub": 73, "keine r\u00fcckantwort": 124, "prozess- und projektmanagement": 67, "dienstreise": 34, "webgestaltung": 78, "schulung": 175, "software": 89, "medientechnik": 76, "servicedesk": 107, "service portal": 94, "software entwicklung": 165, "uniflow": 159, "ub_st\u00f6rungen": 162, "fk15": 183, "uhren": 83, "entwicklung": 163, "videokonferenzen": 97, "itmc webauftritt": 102, "joomla itmc website": 147, "changes": 122, "visitenkartenproduktion": 65, "lizenzmanagement": 146, "tonerb\u00f6rse": 201, "arbeitsplatzsupport": 56} + +list = [(key,value) for key,value in dict.items()] + +list.sort(key=lambda tup : tup[1]) """ +""" +from spacy.tokens.doc import Doc as SpacyDoc + +filepath = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/de_clean_ticket_content.bin" + +# load parser +parser = spacy.load("de") + +corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/" + +stringstorepath = corpus_path + 'de_parser/vocab/strings.json' +with open(stringstorepath) as file: + parser.vocab.strings.load(file) + +vocabpath = Path(corpus_path + 'de_parser/vocab/lexemes.bin') +parser.vocab.load_lexemes(vocabpath) + +spacy_vocab = parser.vocab + +def 
readCorpus(filepath): + with open_sesame(filepath, mode='rb') as f: + for bytes_string in SpacyDoc.read_bytes(f): + yield SpacyDoc(spacy_vocab).from_bytes(bytes_string).text + + +textacy.fileio.write_file_lines(readCorpus(filepath),"/home/jannis.grundmann/PycharmProjects/topicModelingTickets/result.txt") +""" + + + +# load raw corpus and create new one +#raw_corpus, parser = load_corpusV2(corpus_name=rawCorpus_name, corpus_path=corpus_de_path) + +#printRandomDoc(raw_corpus) + + +""" +spacy_doc = PARSER("test") +save_obj(spacy_doc, "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/doc.pkl") + +spacy_doc2 = load_obj("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/doc.pkl") + +print("Doc: {0}".format(spacy_doc2)) + + jgibbsLLDA_root = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/" @@ -63,6 +139,7 @@ laveldict = {'fiona': 10, 'vorlagenerstellung': 36, 'webserver': 29, 'matrix42_h with open(LLDA_filepath, 'w') as file: file.write(json.dumps(laveldict)) """ +""" def load_corpus(corpus_path, corpus_name, lang="de"): from pathlib import Path diff --git a/topicModeling.py b/topicModeling.py index cd35eac..5c2e01b 100644 --- a/topicModeling.py +++ b/topicModeling.py @@ -31,13 +31,21 @@ with open(config_ini) as f: config.read_file(f) +def label2ID(label, labeldict): + return labeldict.get(label, len(labeldict)) -def printvecotorization(de_corpus,ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_entities=True): - printlog(str("ngrams: {0}".format(ngrams))) - printlog(str("min_df: {0}".format(min_df))) - printlog(str("max_df: {0}".format(max_df))) - printlog(str("named_entities: {0}".format(named_entities))) +def generate_labled_lines(textacyCorpus, labeldict): + for doc in textacyCorpus: + # generate [topic1, topic2....] 
tok1 tok2 tok3 out of corpi + yield "[" + str(label2ID(doc.metadata["categoryName"], labeldict)) + "] " + doc.text + + +def printvecotorization(de_corpus, ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_entities=True): + logprint(str("ngrams: {0}".format(ngrams))) + logprint(str("min_df: {0}".format(min_df))) + logprint(str("max_df: {0}".format(max_df))) + logprint(str("named_entities: {0}".format(named_entities))) # printlog("vectorize corpi...") vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df) @@ -48,19 +56,20 @@ def printvecotorization(de_corpus,ngrams=1, min_df=1, max_df=1.0, weighting='tf' for t in terms_list: print(t) - printlog("doc_term_matrix: {0}".format(doc_term_matrix)) - printlog("id2term: {0}".format(id2term)) + logprint("doc_term_matrix: {0}".format(doc_term_matrix)) + logprint("id2term: {0}".format(id2term)) -def textacyTopicModeling(ngrams, min_df, max_df, corpus, n_topics, topicModel='lda',named_entities=False): - printlog( + +def textacyTopicModeling(ngrams, min_df, max_df, corpus, n_topics, topicModel='lda', named_entities=False): + logprint( "############################################ Topic Modeling {0} #############################################".format( topicModel)) print("\n\n") - printlog(str("ngrams: {0}".format(ngrams))) - printlog(str("min_df: {0}".format(min_df))) - printlog(str("max_df: {0}".format(max_df))) - printlog(str("n_topics: {0}".format(n_topics))) - printlog(str("named_entities: {0}".format(named_entities))) + logprint(str("ngrams: {0}".format(ngrams))) + logprint(str("min_df: {0}".format(min_df))) + logprint(str("max_df: {0}".format(max_df))) + logprint(str("n_topics: {0}".format(n_topics))) + logprint(str("named_entities: {0}".format(named_entities))) start = time.time() @@ -98,13 +107,13 @@ def textacyTopicModeling(ngrams, min_df, max_df, corpus, n_topics, topicModel=' print() for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words): - printlog('topic {0}: {1}'.format(topic_idx, " ".join(top_terms))) + logprint('topic {0}: {1}'.format(topic_idx, " ".join(top_terms))) print() for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic): - printlog(topic_idx) + logprint(topic_idx) for j in top_docs: - printlog(corpus[j].metadata['categoryName']) + logprint(corpus[j].metadata['categoryName']) print() ##################################################################################################################### @@ -112,100 +121,142 @@ def textacyTopicModeling(ngrams, min_df, max_df, corpus, n_topics, topicModel=' print() end = time.time() - printlog("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel)) + logprint("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel)) -def jgibbsLLDA(de_corpus, top_topic_words): +def jgibbsLLDA(corpus, path2save_results, top_topic_words=15, add_default_topic=False): ##################### LLDA Topic Modeling via JGibbsLabledLDA ############################################## start = time.time() - def label2ID(label, labeldict): - return labeldict.get(label, len(labeldict)) - - def generate_labled_lines(textacyCorpus,labeldict): - for doc in textacyCorpus: - # generate [topic1, topic2....] 
tok1 tok2 tok3 out of corpi
-        yield "[" + str(label2ID(doc.metadata["categoryName"],labeldict)) + "] " + doc.text
-
     # build dictionary of ticket categories
     labelist = []
 
-    for texdoc in de_corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist):
+    for texdoc in corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist):
         labelist.append(texdoc.metadata["categoryName"])
 
     labeldict = {k: v for v, k in enumerate(labelist)}
 
-    n_topics = len(labeldict) + 1  # +1 for a default topic
+    if add_default_topic:
+        n_topics = len(labeldict) + 1  # +1 for a default topic
+    else:
+        n_topics = len(labeldict)
 
-    jgibbsLLDA_root = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/java_LabledLDA/"
+    jgibbsLLDA_root = FILEPATH + "/java_LabledLDA/"
 
     LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
+    dict_path = "{0}models/tickets/labeldict.txt".format(jgibbsLLDA_root)
 
-    #printlog(str("LABELDICT: {0}".format(labeldict)))
-    printlog(str("LABELDICT-length: {0}".format(len(labeldict))))
+    # logprint(str("LABELDICT: {0}".format(labeldict)))
+    # logprint(str("LABELDICT-length: {0}".format(len(labeldict))))
     with open(dict_path, 'w') as file:
         file.write(json.dumps(labeldict))
 
     # create the input file for the Java implementation
-    textacy.fileio.write_file_lines(generate_labled_lines(de_corpus,labeldict), filepath=LLDA_filepath)
+    textacy.fileio.write_file_lines(generate_labled_lines(corpus, labeldict), filepath=LLDA_filepath)
 
     # wait for file to exist
     while not os.path.exists(LLDA_filepath):
         time.sleep(1)
 
-    printlog("")
-    printlog("start LLDA:")
+    logprint("")
+    logprint("start LLDA:")
 
     # run JGibbsLLDA
     FNULL = open(os.devnull, 'w')  # suppress output
-    subprocess.call(["java",
-                     "-cp",
-                     "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(
-                         jgibbsLLDA_root),
-                     "jgibblda.LDA",
-                     "-est",
-                     "-dir", "{0}models/tickets".format(jgibbsLLDA_root),
-                     "-dfile", "tickets.gz",
-                     "-twords", str(top_topic_words),
-                     "-ntopics", str(n_topics)], stdout=FNULL)
+    cmd_jgibbs_java = ["java", "-cp",
+                       "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(
+                           jgibbsLLDA_root),
+                       "jgibblda.LDA", "-est", "-dir", "{0}models/tickets".format(jgibbsLLDA_root), "-dfile",
+                       "tickets.gz",
+                       "-twords", str(top_topic_words), "-ntopics", str(n_topics)]
+    subprocess.call(cmd_jgibbs_java, stdout=FNULL)
 
-    # ANMERKUNG: Dateien sind versteckt. zu finden in models/
+    # NOTE: the output files are hidden; they can be found in models/
 
     # twords
-    subprocess.call(["gzip", "-dc", "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)])
+    # (earlier experiments used subprocess.Popen/communicate here;
+    # check_output is the simplest way to capture the decompressed output)
+    cmd_gzip = ["gzip", "-dc", "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)]
+    output = subprocess.check_output(cmd_gzip).decode("utf-8")
+
+    # map the numeric topic indices in the twords output back to their
+    # category names, e.g. "Topic 2" -> "Topic 2 betrieb:"
+    reverse_labeldict = {v: k for k, v in labeldict.items()}
+    result = []
+    regex = re.compile(r'Topic [0-9]')
+    for line in output.splitlines():
+        findall = regex.findall(line)
+        if len(findall) != 0:
+            try:
+                index = int(findall[0].split()[1])
+                result.append("Topic {} {}:".format(index, reverse_labeldict[index]))
+            except (ValueError, KeyError):
+                # header-like line that does not parse: keep it unchanged
+                result.append(line)
+        else:
+            result.append(line)
+
+    textacy.fileio.write_file_lines(result, path2save_results)
+    #####################################################################################################################
+    logprint("")
 
     end = time.time()
-    printlog("\n\n\nTime Elapsed Topic Modeling JGibbsLLDA:{0} min\n\n".format((end - start) / 60))
+    logprint("\n\n\nTime Elapsed Topic Modeling JGibbsLLDA:{0} min\n\n".format((end - start) / 60))
 
 
-def main():
-
-    printlog("Topic Modeling: {0}".format(datetime.now()))
+def main(use_raw=False):
+    logprint("Topic Modeling: {0}".format(datetime.now()))
 
     corpus_de_path = FILEPATH + config.get("de_corpus", "path")
     corpus_en_path = FILEPATH + config.get("en_corpus", "path")
 
-    preCorpus_name = "de" + "_pre_ticket"
+    if use_raw:
+        preCorpus_name = "de" + "_raw_ticket"
+    else:
+        preCorpus_name = "de" + "_pre_ticket"
 
-    #load raw corpus and create new one
+    # load corpus
     de_corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path)
-    printlog("Corpus loaded: {0}".format(de_corpus.lang))
+    logprint("Corpus loaded: {0}".format(de_corpus.lang))
 
-    #idee http://bigartm.org/
-    #idee http://wiki.languagetool.org/tips-and-tricks
+    # idea: http://bigartm.org/
+    # idea: http://wiki.languagetool.org/tips-and-tricks
+    # idea: https://en.wikipedia.org/wiki/Noisy_text_analytics
+    # idea: https://gate.ac.uk/family/
 
-    # todo gescheites tf(-idf) maß finden
+    # todo find a sensible tf(-idf) measure
+    # todo per model: save the labeled corpus, the results and the labeldict
+    # todo merge similar topics
+    # open question: how many tickets per topic?
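+
+    # Reminder of the LLDA input format generated per run (illustrative):
+    # one document per line, the numeric label in brackets, then the tokens,
+    # e.g.
+    #
+    #   [2] telefon umzug antragsteller gramm einrichtung dezernat ...
+    #
+    # The bracketed ID is resolved via labeldict, which maps category names
+    # such as "betrieb" to integer topic IDs.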
+
+    ngrams = 1
+    min_df = 1
+    max_df = 1.0
@@ -213,19 +264,44 @@ def main():
     # weighting ='tfidf'
     named_entities = False
 
     """
     printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting=weighting)
     printvecotorization(ngrams=1, min_df=1, max_df=0.5, weighting=weighting)
     printvecotorization(ngrams=1, min_df=1, max_df=0.8, weighting=weighting)
 
     printvecotorization(ngrams=(1, 2), min_df=1, max_df=1.0, weighting=weighting)
     printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.5, weighting=weighting)
     printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.8, weighting=weighting)
     """
 
+    if use_raw:
+        resultspath = FILEPATH + "results/raw"
+    else:
+        resultspath = FILEPATH + "results/pre"
 
-    jgibbsLLDA(de_corpus,15)
+    # run LLDA once per parameter combination; result files are named after
+    # the settings, e.g. results/pre5_False.txt
+    for top_topic_words, add_default_topic in [(5, False), (5, True), (10, False), (10, True)]:
+        path2save_results = resultspath + "{}_{}.txt".format(top_topic_words, add_default_topic)
+        jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
+                   add_default_topic=add_default_topic)
 
     # no_below = 20
     # no_above = 0.5
@@ -242,45 +318,44 @@ def main():
                   topicModel = 'lda',
                   n_topics = len(LABELDICT),
                   corpi=de_corpus)
 
     topicModeling(ngrams = 1,
                   min_df = 0.1,
                   max_df = 0.6,
                   topicModel = 'lda',
                   n_topics = len(LABELDICT),
                   corpi=de_corpus)
 
     topicModeling(ngrams = (1,2),
                   min_df = 1,
                   max_df = 1.0,
                   topicModel = 'lda',
                   n_topics = len(LABELDICT),
                   corpi=de_corpus)
 
     topicModeling(ngrams = (1,2),
                   min_df = 0.1,
                   max_df = 0.6,
                   topicModel = 'lda',
                   n_topics = len(LABELDICT),
                   corpi=de_corpus)
 
     topicModeling(ngrams = (1,2),
                   min_df = 0.2,
                   max_df = 0.8,
                   topicModel = 'lda',
                   n_topics = 20,
                   corpi=de_corpus)
     """
 
 
 if __name__ == "__main__":
     main()
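
A minimal sketch of reusing the saved label dictionary outside the pipeline,
e.g. to map JGibbsLabeledLDA topic indices back to ticket categories (the path
follows the defaults above; illustrative only, not part of the diff):

    import json

    # labeldict.txt is written by jgibbsLLDA() before the Java run
    with open("java_LabledLDA/models/tickets/labeldict.txt") as f:
        labeldict = json.load(f)  # e.g. {"betrieb": 2, "lan": 1, ...}

    id2label = {v: k for k, v in labeldict.items()}
    print(id2label[2])  # -> "betrieb"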