further tidied up

jannis.grundmann 2017-10-17 10:13:49 +02:00
parent 56c8bce2d7
commit 17e45c30af
8 changed files with 170 additions and 388 deletions


@@ -1,58 +1,53 @@
[thesaurus]
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/deWordNet.xml
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_dict.pkl
input = deWordNet.xml
pickle_file = thesaurus_dict.pkl
[spellchecking]
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_dict.pkl
input = deu_news_2015_1M-sentences.txt
pickle_file = words_dict.pkl
[lemmatization]
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict.pkl
input = lemmas.txt
pickle_file = lemma_dict.pkl
[nouns]
input1 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen.txt
input2 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen2.txt
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list.pkl
input1 = nomen.txt
input2 = nomen2.txt
pickle_file = nouns_list.pkl
[firstnames]
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames.txt
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list.pkl
input = firstnames.txt
pickle_file = firstnames_list.pkl
[de_stopwords]
input1 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_1.txt
input2 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_2.txt
input3 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_3.txt
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list.pkl
input1 = de_stopwords_1.txt
input2 = de_stopwords_2.txt
input3 = de_stopwords_3.txt
pickle_file = stopwords_list.pkl
[logging]
level = INFO
filename = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log
filename = topicModelTickets.log
[de_corpus]
#input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv
#input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv
#input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_mini.csv
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv
path = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/
raw = de_raw_ticket
pre = de_pre_ticket
#input = M42-Export/Tickets_med.csv
#input = M42-Export/Tickets_small.csv
#input = M42-Export/Tickets_mini.csv
input = M42-Export/de_tickets.csv
path = corpi/
[en_corpus]
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/en_tickets.csv
input = M42-Export/en_tickets.csv
path = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/
raw = en_raw_ticket
pre = en_pre_ticket
path = corpi/
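
The paths in config.ini are now relative; each script prepends FILEPATH (the directory the module lives in) to every value it reads. A minimal sketch of that pattern, reusing the section and key names from the config above; the read_file call is assumed from the truncated "with open(config_ini) as f:" block and is not a verbatim excerpt of the repo:

import configparser
import os

# all config values are resolved relative to the directory of the running script
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

config = configparser.ConfigParser()
with open(FILEPATH + "config.ini") as f:
    config.read_file(f)

# prefix FILEPATH to turn the relative config entries into absolute paths
path2thesaurus_dict = FILEPATH + config.get("thesaurus", "pickle_file")
corpus_de_path = FILEPATH + config.get("de_corpus", "path")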


@@ -1,53 +1,20 @@
# -*- coding: utf-8 -*-
from datetime import datetime
import time
import logging
from stop_words import get_stop_words
#import words as words
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
import csv
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import pickle
import configparser as ConfigParser
from miscellaneous import *
import time
from datetime import datetime
import logging
from nltk.corpus import stopwords
import csv
import functools
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import os
csv.field_size_limit(sys.maxsize)
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
# load config
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
config_ini = FILEPATH + "config.ini"
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
@@ -128,70 +95,61 @@ metaliste = [
content_collumn_name = config.get("tickets","content_collumn_name")
metaliste = config.get("tickets","metaliste")
metaliste = config.get("tickets","metaliste").split(",")
path2de_csv = config.get("de_corpus","input")
corpus_de_path = config.get("de_corpus", "path")
raw_de_name = config.get("de_corpus", "raw")
path2de_csv = FILEPATH + config.get("de_corpus","input")
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
path2en_csv = config.get("en_corpus","input")
corpus_en_path = config.get("en_corpus", "path")
raw_en_name = config.get("en_corpus", "raw")
path2en_csv = FILEPATH + config.get("en_corpus","input")
corpus_en_path = FILEPATH + config.get("en_corpus", "path")
def main():
start = time.time()
printlog("Corporization: {0}".format(datetime.now()))
#print paths
path_csv_split = path2de_csv.split("/")
printlog(path_csv_split[len(path_csv_split) - 1])
path_csv_split = path2en_csv.split("/")
printlog(path_csv_split[len(path_csv_split) - 1])
def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, lang, printrandom=0):
DE_PARSER = spacy.load("de")
EN_PARSER = spacy.load("en")
# print paths
path_csv_split = path2_csv.split("/")
filename = path_csv_split[len(path_csv_split) - 1]
raw_de_corpus = textacy.Corpus(DE_PARSER)
raw_en_corpus = textacy.Corpus(EN_PARSER)
printlog("Corporization of {0} at {1}".format(filename,datetime.now()))
raw_corpus = textacy.Corpus(lang)
## add files to textacy-corpi,
printlog("Add texts to textacy-corpi")
printlog("Add texts to {0}_textacy-corpi".format(lang))
raw_de_corpus.add_texts(
ticketcsv_to_textStream(path2de_csv, content_collumn_name),
ticket_csv_to_DictStream(path2de_csv, metaliste)
)
raw_en_corpus.add_texts(
ticketcsv_to_textStream(path2en_csv, content_collumn_name),
ticket_csv_to_DictStream(path2en_csv, metaliste)
raw_corpus.add_texts(
ticketcsv_to_textStream(path2_csv, content_collumn_name),
ticket_csv_to_DictStream(path2_csv, metaliste)
)
# kick empty docs out of the corpora
raw_de_corpus.remove(lambda doc: len(doc) == 0)
raw_en_corpus.remove(lambda doc: len(doc) == 0)
raw_corpus.remove(lambda doc: len(doc) == 0)
# print random docs
for i in range(printrandom):
printRandomDoc(raw_corpus)
#for i in range(20):
# printRandomDoc(raw_de_corpus)
# printRandomDoc(raw_en_corpus)
# save corpus
raw_name = lang + "_raw_ticket"
save_corpus(corpus=raw_corpus, corpus_path=corpus_path, corpus_name=raw_name)
#save corpi
save_corpus(corpus=raw_de_corpus, corpus_path=corpus_de_path, corpus_name=raw_de_name)
save_corpus(corpus=raw_en_corpus, corpus_path=corpus_en_path, corpus_name=raw_en_name)
def main():
start = time.time()
ticketcsv2Corpus(path2de_csv,corpus_de_path,content_collumn_name,metaliste,lang="de")
ticketcsv2Corpus(path2en_csv,corpus_en_path,content_collumn_name,metaliste,lang="en")
end = time.time()
printlog("Time Elapsed Corporization:{0} min".format((end - start) / 60))

init.py

@@ -1,29 +1,24 @@
# -*- coding: utf-8 -*-
from datetime import datetime
import time
import logging
from stop_words import get_stop_words
#import words as words
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
import csv
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import pickle
import configparser as ConfigParser
from miscellaneous import *
from stop_words import get_stop_words
import csv
import sys
import xml.etree.ElementTree as ET
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
import time
from datetime import datetime
import os
csv.field_size_limit(sys.maxsize)
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
# load config
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
config_ini = FILEPATH + "config.ini"
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
@@ -70,7 +65,7 @@ def build_thesaurus_dict(path2wordnet,returnall=False):
Result will be used as lemma_dict["word"] --> lemma
:param path2lexicalentries: str
:param path2wordnet: str
:param returnall: bool; if True, also return word2synsets and synset2Words
:return: dictionaries: thesaurus
"""
@@ -242,34 +237,34 @@ def build_words_for_spellchecking(path2words):
##################################################################################################
# THESAURUS
path2wordnet = config.get("thesaurus","input")
path2thesaurus_dict = config.get("thesaurus","pickle_file")
path2wordnet = FILEPATH + config.get("thesaurus","input")
path2thesaurus_dict = FILEPATH + config.get("thesaurus","pickle_file")
# SPELLCHECKING
path2words_file = config.get("spellchecking","input")
path2wordlist = config.get("spellchecking","pickle_file")
path2words_file = FILEPATH + config.get("spellchecking","input")
path2wordlist = FILEPATH + config.get("spellchecking","pickle_file")
# LEMMA
path2lemma_file = config.get("lemmatization","input")
path2lemmadict = config.get("lemmatization","pickle_file")
path2lemma_file = FILEPATH + config.get("lemmatization","input")
path2lemmadict = FILEPATH + config.get("lemmatization","pickle_file")
# NOUNS
nouns1 = config.get("nouns","input1")
nouns2 = config.get("nouns","input2")
path2nouns_list = config.get("nouns","pickle_file")
nouns1 = FILEPATH + config.get("nouns","input1")
nouns2 = FILEPATH + config.get("nouns","input2")
path2nouns_list = FILEPATH + config.get("nouns","pickle_file")
# FIRST NAMES
firstnames_txt = config.get("firstnames","input")
path2firstnameslist = config.get("firstnames","pickle_file")
firstnames_txt = FILEPATH + config.get("firstnames","input")
path2firstnameslist = FILEPATH + config.get("firstnames","pickle_file")
# STOPWORDS
stop1 = config.get("de_stopwords","input1")
stop2 = config.get("de_stopwords","input2")
stop3 = config.get("de_stopwords","input3")
path2stopwordlist = config.get("de_stopwords","pickle_file")
stop1 = FILEPATH + config.get("de_stopwords","input1")
stop2 = FILEPATH + config.get("de_stopwords","input2")
stop3 = FILEPATH + config.get("de_stopwords","input3")
path2stopwordlist = FILEPATH + config.get("de_stopwords","pickle_file")


@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
import time
import init
import corporization
import preprocessing
@@ -8,7 +8,7 @@ from miscellaneous import *
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_main.log &"
start = time.time()
init.main()
printlog("")
@@ -19,3 +19,5 @@ printlog("")
preprocessing.main()
printlog("")
end = time.time()
printlog("Total Time Elapsed: {0} min".format((end - start) / 60))


@@ -1,87 +1,25 @@
# -*- coding: utf-8 -*-
import random
import time
from pathlib import Path
from datetime import datetime
import logging
from nltk.corpus import stopwords
import csv
import functools
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
from datetime import datetime
import time
start = time.time()
import logging
from nltk.corpus import stopwords
import csv
import functools
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import time
import enchant
start = time.time()
import logging
import csv
import functools
import os.path
import re
import subprocess
import time
import xml.etree.ElementTree as ET
import sys
import spacy
import textacy
from scipy import *
from textacy import Vectorizer
import warnings
import configparser as ConfigParser
import sys
import hunspell
from postal.parser import parse_address
from datetime import datetime
import time
import logging
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
import csv
import functools
import logging
import random
import re
import xml.etree.ElementTree as ET
import sys
from pathlib import Path
import pickle
import spacy
import textacy
from scipy import *
import sys
import os
csv.field_size_limit(sys.maxsize)
import pickle
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
# load config
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
config_ini = FILEPATH + "config.ini"
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
@@ -90,7 +28,7 @@ with open(config_ini) as f:
# config logging
filename = config.get("logging","filename")
filename = FILEPATH + config.get("logging","filename")
level = config.get("logging","level")
if level == "INFO":
level = logging.INFO
@@ -188,10 +126,13 @@ def printRandomDoc(textacyCorpus):
:param textacyCorpus:
"""
print()
printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
randIndex = int((len(textacyCorpus) - 1) * random.random())
printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
textacyCorpus[randIndex].metadata))
if len(textacyCorpus) == 0:
printlog("NO DOCS IN CORPUS")
else:
printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
randIndex = int((len(textacyCorpus) - 1) * random.random())
printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
textacyCorpus[randIndex].metadata))
print()
@@ -239,14 +180,14 @@ def load_corpus(corpus_path, corpus_name, lang="de"):
Load a textacy corpus, including its spaCy parser, from file
:param corpus_path: str
:param corpus_name: str (should contain the language, like "_de_")
:param lang: str language code)
:param lang: str (language code) or spacy.Language
:return: textacy.Corpus, spacy.language
"""
# check for language
if "_de_" in corpus_name:
if "de_" in corpus_name:
lang="de"
elif "_en_" in corpus_name:
elif "en_" in corpus_name:
lang ="en"


@@ -1,76 +1,23 @@
# -*- coding: utf-8 -*-
from datetime import datetime
print(datetime.now())
from datetime import datetime
import time
import logging
from stop_words import get_stop_words
#import words as words
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
import csv
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import pickle
import configparser as ConfigParser
from miscellaneous import *
import time
from datetime import datetime
import logging
from nltk.corpus import stopwords
import csv
import functools
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import time
import logging
from nltk.corpus import stopwords
import csv
import functools
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
import os
csv.field_size_limit(sys.maxsize)
import pickle
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
# load config
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
config_ini = FILEPATH + "config.ini"
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
@@ -178,7 +125,6 @@ def getFirstSynonym(word, thesaurus=THESAURUS):
# http://wortschatz.uni-leipzig.de/en/download
import re
from collections import Counter
def words(text): return re.findall(r'\w+', text.lower())
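
The words() helper and the Leipzig word-frequency file (deu_news_2015_1M-sentences.txt, pickled as words_dict.pkl) point to a Norvig-style frequency spell checker. The rest of such a corrector is not part of this hunk; the following is a standard sketch for orientation, with function names following Norvig's essay rather than the repo, and edits2 omitted for brevity:

import re
from collections import Counter

def words(text):
    return re.findall(r'\w+', text.lower())

# word frequencies built from the Leipzig sentence file (stored as words_dict.pkl in the repo)
WORDS = Counter(words(open("deu_news_2015_1M-sentences.txt").read()))

def P(word, N=sum(WORDS.values())):
    # relative frequency of a word in the corpus
    return WORDS[word] / N

def correction(word):
    # most probable spelling correction for word
    return max(candidates(word), key=P)

def candidates(word):
    return known([word]) or known(edits1(word)) or [word]

def known(wordlist):
    return set(w for w in wordlist if w in WORDS)

def edits1(word):
    # all edits that are one edit away from word
    letters = "abcdefghijklmnopqrstuvwxyzäöüß"
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)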
@@ -250,7 +196,7 @@ def stringcleaning(stringstream):
string = " ".join(re.compile(REGEX_SPECIALCHAR).split(string))
# cut_after
word = "gruss"
word = "gruss" #idee addressen enfernen --> postal.parser
string = string.rpartition(word)[0] if word in string else string
# lemmatize
@@ -347,29 +293,23 @@ def processDictstream(dictstream, funcdict, parser):
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/preprocessing.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_preprocessing.log &"
path2thesaurus_dict = config.get("thesaurus","pickle_file")
path2thesaurus_dict = FILEPATH + config.get("thesaurus","pickle_file")
path2wordsdict = config.get("spellchecking", "pickle_file")
path2wordsdict = FILEPATH + config.get("spellchecking", "pickle_file")
path2lemmadict = config.get("lemmatization","pickle_file")
path2lemmadict = FILEPATH + config.get("lemmatization","pickle_file")
path2nouns_list = config.get("nouns","pickle_file")
path2nouns_list = FILEPATH + config.get("nouns","pickle_file")
path2firstnameslist = config.get("firstnames","pickle_file")
path2firstnameslist = FILEPATH + config.get("firstnames","pickle_file")
path2stopwordlist = config.get("de_stopwords","pickle_file")
path2stopwordlist = FILEPATH + config.get("de_stopwords","pickle_file")
corpus_de_path = config.get("de_corpus", "path")
raw_de_name = config.get("de_corpus", "raw")
pre_de_name = config.get("de_corpus", "pre")
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
corpus_en_path = config.get("en_corpus", "path")
raw_en_name = config.get("en_corpus", "raw")
pre_en_name = config.get("en_corpus", "pre")
corpus_en_path = FILEPATH + config.get("en_corpus", "path")
custom_words = ["geehrt", "dame", "herr", "hilfe", "problem", "lauten", "bedanken", "voraus",
@@ -383,10 +323,8 @@ custom_words = ["geehrt", "dame", "herr", "hilfe", "problem", "lauten", "bedanke
"funktionieren", "kollege", "pruefen", "hoffen"
]
filter_tokens = [
# removeENT(["PERSON"]),
# idea: remove addresses; so far done via cut_after("gruss") --> postal.parser
keepNouns(),
@@ -403,8 +341,8 @@ filter_tokens = [
]
# todo: filter_tokens currently kicks out every token
filter_tokens = None
clean_in_meta = {
"Solution": [removePOS(["SPACE"])],
@@ -412,17 +350,43 @@ clean_in_meta = {
"categoryName": [removePOS(["SPACE", "PUNCT"])]
}
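
filter_tokens above is assembled from helpers such as keepNouns(), removeENT([...]) and removePOS([...]) that are defined elsewhere in the repo; from the way processContentstream() consumes the list, each helper apparently returns a predicate over spaCy tokens. A rough sketch of that factory pattern, with signatures and details being assumptions:

# Sketch of the filter-factory pattern; the real helpers live in the repo and may differ.
def removePOS(pos_list):
    # drop tokens whose part-of-speech tag is in pos_list
    return lambda tok: tok.pos_ not in pos_list

def removeENT(ent_list):
    # drop tokens belonging to the listed named-entity types
    return lambda tok: tok.ent_type_ not in ent_list

def keepNouns(noun_whitelist=None):
    # keep nouns (and, if given, anything on an explicit whitelist)
    if noun_whitelist is None:
        return lambda tok: tok.pos_ in ("NOUN", "PROPN")
    return lambda tok: tok.pos_ in ("NOUN", "PROPN") or tok.lower_ in noun_whitelist

def apply_filters(doc, filterlist):
    # a token survives only if every predicate accepts it
    return [tok for tok in doc if all(f(tok) for f in filterlist)]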
def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrandom=10):
printlog("Preprocess {0}_corpus at {1}".format(lang,datetime.now()))
rawCorpus_name = lang + "_raw_ticket"
preCorpus_name = lang + "_pre_ticket"
#load raw corpus and create new one
raw_corpus, parser = load_corpus(corpus_name=rawCorpus_name, corpus_path=corpus_path)
corpus = textacy.Corpus(parser)
## process and add files to textacy-corpi,
corpus.add_texts(
processContentstream(corpus2Text(raw_corpus), token_filterlist=filter_tokens, parser=parser),
processDictstream(corpus2Meta(raw_corpus), clean_in_meta,parser=parser)
)
# kick empty docs out of the corpora
corpus.remove(lambda doc: len(doc) == 0)
for i in range(printrandom):
printRandomDoc(corpus)
#save corpus
save_corpus(corpus=corpus, corpus_path=corpus_path, corpus_name=preCorpus_name)
def main():
start = time.time()
printlog("Preprocessing: {0}".format(datetime.now()))
THESAURUS = load_obj(path2thesaurus_dict)
@@ -432,44 +396,9 @@ def main():
NOUNS = load_obj(path2nouns_list)
VORNAMEN = load_obj(path2firstnameslist)
preprocessCorpus(corpus_de_path, filter_tokens, clean_in_meta, "de" )
#load raw corpus and create new one
raw_de_corpus, DE_PARSER = load_corpus(corpus_name=raw_de_name, corpus_path=corpus_de_path)
raw_en_corpus, EN_PARSER = load_corpus(corpus_name=raw_en_name, corpus_path=corpus_en_path)
de_corpus = textacy.Corpus(DE_PARSER)
en_corpus = textacy.Corpus(EN_PARSER)
## process and add files to textacy-corpi,
printlog("Preprocess and add texts to textacy-corpi")
de_corpus.add_texts(
processContentstream(corpus2Text(raw_de_corpus), token_filterlist=filter_tokens, parser=DE_PARSER),
processDictstream(corpus2Meta(raw_de_corpus), clean_in_meta,parser=raw_de_corpus.lang)
)
en_corpus.add_texts(
processContentstream(corpus2Text(raw_en_corpus), token_filterlist=filter_tokens, parser=EN_PARSER),
processDictstream(corpus2Meta(raw_en_corpus), clean_in_meta,parser=raw_en_corpus.lang)
)
# kick empty docs out of the corpora
de_corpus.remove(lambda doc: len(doc) == 0)
en_corpus.remove(lambda doc: len(doc) == 0)
for i in range(20):
printRandomDoc(de_corpus)
#printRandomDoc(en_corpus)
#save corpi
save_corpus(corpus=de_corpus, corpus_path=corpus_de_path, corpus_name=pre_de_name)
save_corpus(corpus=en_corpus, corpus_path=corpus_en_path, corpus_name=pre_en_name)
preprocessCorpus(corpus_en_path, filter_tokens, clean_in_meta, "en" )
end = time.time()


@@ -1,10 +1,11 @@
# -*- coding: utf-8 -*-
import re
import time
import json
import spacy
import textacy
#import spacy
#import textacy
from functools import reduce
start = time.time()
@@ -15,7 +16,6 @@ from datetime import datetime
import xml.etree.ElementTree as ET
print(datetime.now())
"""
PARSER=spacy.load("de")
@@ -55,19 +55,8 @@ print(corpi)
"""
import pickle
def save_obj(obj, path):
with open(path + '.pkl', 'wb') as f:
pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
def load_obj(path ):
with open(path + '.pkl', 'rb') as f:
return pickle.load(f)
"""
def load_corpus(corpus_path, corpus_name, lang="de"):
from pathlib import Path
@@ -95,7 +84,7 @@ def load_corpus(corpus_path, corpus_name, lang="de"):
corpus.add_doc(
textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
return corpus
"""
import os
a = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_1.txt"
b = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_2.txt"
@@ -103,37 +92,16 @@ d = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_3.
c = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/en_stopwords_1.txt"
liste = [a,b,c,d]
de_filepaths = [path for path in liste if os.path.basename(path).split("_")[0]=='de' and os.path.basename(path).split("_")[1]=='stopwords']
from nltk.corpus import stopwords as nltk_stopwords
from stop_words import get_stop_words
import spacy
from miscellaneous import *
# from packages
de_stop_words1 = list(get_stop_words("de"))
de_stop_words2 = list(nltk_stopwords.words('german'))
de_stop_words3 = list(__import__("spacy.de", globals(), locals(), ['object']).STOP_WORDS)
# from files
de_stop_words_list = [list(textacy.fileio.read_file_lines(path)) for path in de_filepaths]
de_stop_words4 = [item for sublist in de_stop_words_list for item in sublist]
#print(de_stop_words4)
de_stop_words = list(set(map(replaceRockDots(),list(map(textacy.preprocess.normalize_whitespace, de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4)))))
print(len(de_stop_words))
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/testra.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_testra.log &"
scriptpath = os.path.dirname(os.path.realpath(__file__))
"""
# THESAURUS
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries_small.xml"
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
@@ -212,7 +180,7 @@ def build_thesaurus(path2lexicalentries):#, path2synsets):
pass
return thesaurus
"""
for r in synroot:
for element in r:
@@ -223,9 +191,8 @@ def build_thesaurus(path2lexicalentries):#, path2synsets):
if id not in synset2Words.keys():
synset2Words[id] = "WORD"
"""
"""
"""
from postal.parser import parse_address
@@ -557,6 +524,7 @@ de_stop_words = list(map(textacy.preprocess.normalize_whitespace,de_stop_words))
textacy.fileio.write_file_lines(de_stop_words,"german_stopwords.txt")
"""
end = time.time()
print("\n\n\nTime Elapsed Topic:{0}\n\n".format(end - start))


@@ -94,9 +94,6 @@ def printvecotorization(ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_en
printlog("doc_term_matrix: {0}".format(doc_term_matrix))
printlog("id2term: {0}".format(id2term))
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"
corpus_name = "de_corpus"
@@ -104,11 +101,6 @@ corpus_name = "de_corpus"
de_corpus = load_corpus(corpus_name=corpus_name,corpus_path=corpus_path)
for i in range(5):
printRandomDoc(de_corpus)
# todo: find a proper tf(-idf) measure
ngrams = 1
@@ -128,6 +120,7 @@ printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.5, weighting=weighting)
printvecotorization(ngrams=(1, 2), min_df=1, max_df=0.8, weighting=weighting)
"""
# build dictionary of ticket categories
labelist = []
@@ -139,6 +132,7 @@ LABELDICT = {k: v for v, k in enumerate(labelist)}
printlog(str("LABELDICT: {0}".format(LABELDICT)))
def textacyTopicModeling(ngrams, min_df, max_df, topicModel='lda', n_topics=len(LABELDICT), named_entities=False,
corpus=de_corpus):
printlog(