Added regex for further cleaning

parent 13ec7cdef4
commit 092052dfe1

config.ini (20 changes)
@@ -1,7 +1,19 @@
-[default]
-
-thesauruspath = openthesaurus.csv
-
-path2xml = ticketSamples.xml
+[filepath]
+
+thesauruspath = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/openthesaurus.csv
+
+path2xml = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/ticketSamples.xml
+
+path2csv = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_2017-09-13.csv
+
+small = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv
+
+logfile = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log
+
+lemmas = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmatization-de.txt
 
 language = de
 
@@ -9,7 +21,7 @@ language = de
 
 ents2keep = WORK_OF_ART,ORG,PRODUCT,LOC
 
-custom_words = grüßen,fragen
+custom_words = grüßen,fragen,damen,probleme,herren,dank
 
 #lemmatize = True
 
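Note (not part of the commit): the [default] section is renamed to [filepath] and all paths become absolute. A minimal sketch of how the new section could be read with Python's configparser; the key names are taken from the diff above, while the consuming snippet itself is hypothetical:

# sketch: reading the renamed [filepath] section (key names as in the diff above)
import configparser

config = configparser.ConfigParser()
with open("config.ini") as f:
    config.read_file(f)

path2csv = config.get("filepath", "path2csv")
logfile = config.get("filepath", "logfile")
lemmas = config.get("filepath", "lemmas")
print(path2csv, logfile, lemmas)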
|
5 binary files changed (binary files not shown).
1 file diff suppressed because it is too large.
@@ -22,7 +22,7 @@ config = ConfigParser.ConfigParser()
 with open("config.ini") as f:
     config.read_file(f)
 
-
+#todo print&log
 
 path2xml = config.get("default","path2xml")
 
@@ -281,7 +281,7 @@ topicModel = 'lda'
 weighting = ('tf' if topicModel == 'lda' else 'tfidf')
 
 top_topic_words = 5
-top_document_labels_per_topic = 2
+top_document_labels_per_topic = 5
 
 n_topics = len(set(corpus[0].metadata.keys()))+1 #+1 wegen einem default-topic
 
@@ -311,7 +311,7 @@ id2term = vectorizer.__getattribute__("id_to_term")
 
 
 
-
+"""
 ##################### LSA, LDA, NMF Topic Modeling via Textacy ##############################################
 
 # Initialize and train a topic model
@@ -339,7 +339,7 @@ for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_docu
 print()
 print()
 
-
+"""
 
 
 
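As a worked example of the n_topics line kept above (the metadata values are made up): if the first document's metadata carries the keys Subject, categoryName and Solution, the model is trained with 3 + 1 = 4 topics, the extra one serving as the default topic.

# hedged example of the n_topics computation; the metadata keys are illustrative
metadata = {"Subject": "...", "categoryName": "...", "Solution": "..."}
n_topics = len(set(metadata.keys())) + 1  # +1 for the default topic
print(n_topics)  # 4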
test.py (200 changes)
@@ -1,7 +1,11 @@
 # -*- coding: utf-8 -*-
+
+
+
 import time
 start = time.time()
 
+import logging
 
 import csv
 import functools
@@ -16,45 +20,55 @@ import textacy
 from scipy import *
 from textacy import Vectorizer
 import warnings
-csv.field_size_limit(sys.maxsize)
+import configparser as ConfigParser
 
 import sys
 
-old_stdout = sys.stdout
+csv.field_size_limit(sys.maxsize)
 
-log_file = open("printout.log","w")
 
-sys.stdout = log_file
 
 
 # Load the configuration file
-import configparser as ConfigParser
+config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
 
 config = ConfigParser.ConfigParser()
-with open("config.ini") as f:
+with open(config_ini) as f:
     config.read_file(f)
 
 
+# config logging
+logging.basicConfig(filename=config.get("filepath","logfile"), level=logging.INFO)
 
-path2csv = "M42-Export/Tickets_small.csv"
-#path2csv = "M42-Export/Tickets_2017-09-13.csv" # 21167
 
-path2xml = config.get("default","path2xml")
-thesauruspath = config.get("default","thesauruspath")
 
-DE_PARSER = spacy.load("de")
-de_stop_words=list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)
 
+thesauruspath = config.get("filepath","thesauruspath")
 THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";"))
 
 
+DE_PARSER = spacy.load("de") #todo spacherkennung idee: verschiedene Corpi für verschiedene Sprachen
+de_stop_words=list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)
 
 
+LEMMAS=config.get("filepath","lemmas")
 
 
 ############# misc
 
+def printlog(string, level="INFO"):
+    """log and prints"""
+    print(string)
+    if level=="INFO":
+        logging.info(string)
+    elif level=="DEBUG":
+        logging.debug(string)
+    elif level == "WARNING":
+        logging.warning(string)
+printlog("Load functions")
 
 def compose(*functions):
     def compose2(f, g):
         return lambda x: f(g(x))
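The new printlog helper replaces the earlier stdout redirection into printout.log: messages are printed and also written to the logfile configured under [filepath]. A self-contained sketch of the same pattern (the logfile path here is illustrative and not read from config.ini):

# sketch of the print-and-log pattern introduced above; the logfile path is illustrative
import logging

logging.basicConfig(filename="topicModelTickets.log", level=logging.INFO)

def printlog(string, level="INFO"):
    """log and prints"""
    print(string)
    if level == "INFO":
        logging.info(string)
    elif level == "DEBUG":
        logging.debug(string)
    elif level == "WARNING":
        logging.warning(string)

printlog("Load functions")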
@@ -84,17 +98,17 @@ def get_calling_function():
         return func
     raise AttributeError("func not found")
 
 
 def printRandomDoc(textacyCorpus):
     import random
     print()
 
-    print("len(textacyCorpus) = %i" % len(textacyCorpus))
+    printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
     randIndex = int((len(textacyCorpus) - 1) * random.random())
-    print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
+    printlog("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
 
     print()
 
 
 ############# load xml
 def generateMainTextfromTicketXML(path2xml, main_textfield='Description'):
     """
@@ -111,6 +125,7 @@ def generateMainTextfromTicketXML(path2xml, main_textfield='Description'):
     for field in ticket:
         if field.tag == main_textfield:
             yield field.text
+
 def generateMetadatafromTicketXML(path2xml, leave_out=['Description']):
     tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
     root = tree.getroot()
@@ -144,6 +159,7 @@ def csv_to_contentStream(path2csv: str, content_collumn_name: str):
             content_collumn = j
         else:
             yield lst[content_collumn]
+
 def csv_to_metaStream(path2csv: str, metalist: [str]):
     """
     :param path2csv: string
@@ -171,9 +187,8 @@ def csv_to_metaStream(path2csv: str, metalist: [str]):
 
 
 
+############################################ Preprocessing ##############################################
 
-print("############################################ Preprocessing ##############################################")
-print()
 
 ############# on str-gen
 
@@ -211,7 +226,11 @@ def processTextstream(textstream, funclist, parser=DE_PARSER):
     pipe = parser.pipe(textstream)
 
     for doc in pipe:
-        tokens = [tok for tok in doc]
+
+        tokens = []
+        for tok in doc:
+            tokens.append(tok)
+
         tokens = processTokens(tokens,funclist,parser)
         yield " ".join([tok.lower_ for tok in tokens])
 
@@ -264,7 +283,6 @@ def removePOS(pos_list)-> bool:
     return ret
 
 def removeWords(words, keep=None)-> bool:
-    #todo in:str oder str-list
     if hasattr(keep, '__iter__'):
         for k in keep:
             try:
@@ -289,13 +307,37 @@ def removeENT(ent_list) -> bool:
     ret.__annotations__ = get_calling_function().__annotations__
     return ret
 
-def lemmatize() -> str:
-    ret = lambda tok: tok.lemma_
+def remove_words_containing_Numbers() -> bool:
+    ret = lambda tok: not bool(re.search('\d', tok.lower_))
 
     ret.__annotations__ = get_calling_function().__annotations__
     return ret
 
 
+def remove_words_containing_specialCharacters() -> bool:
+    ret = lambda tok: not bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?]', tok.lower_))
+
+    ret.__annotations__ = get_calling_function().__annotations__
+    return ret
+
+def lemmatizeWord(word,filepath=LEMMAS):
+    """http://www.lexiconista.com/datasets/lemmatization/"""
+    for line in list(textacy.fileio.read_file_lines(filepath=filepath)):
+        if word.lower() == line.split()[1].strip().lower():
+            return line.split()[0].strip().lower()
+    return word.lower() # falls nix gefunden wurde
+
+def lemmatize() -> str:
+    ret = lambda tok: lemmatizeWord(tok.lower_)
+
+    ret.__annotations__ = get_calling_function().__annotations__
+    return ret
+
 
 
 def normalizeSynonyms(default_return_first_Syn=False) -> str:
     ret = lambda tok : getFirstSynonym(tok.lower_, default_return_first_Syn=default_return_first_Syn)
 
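These two filters are the "regex for further cleaning" named in the commit message: tokens containing digits or special characters are dropped instead of being normalized. A hedged sketch of what they match, using plain strings in place of spaCy tokens (in the pipeline the patterns are applied to tok.lower_); the sample words come from the TODO notes further down. Note also that lemmatizeWord re-reads the lemmatization file for every token, so a prebuilt dictionary lookup would likely be faster.

# plain-string sketch of the two new regex filters; sample words from the notes below
import re

def contains_number(word):
    return bool(re.search(r'\d', word))

def contains_special_char(word):
    return bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?]', word))

for w in ["nr54065467", "455a33c5", "------problem--------", "verfügung"]:
    keep = not contains_number(w) and not contains_special_char(w)
    print(w, "->", "keep" if keep else "drop")
# only "verfügung" is kept; the junk examples are dropped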
@@ -377,6 +419,15 @@ def replacePhonenumbers(replace_with="PHONENUMBER") -> str:
     return ret
 
 
+
+
+def fixUnicode() -> str:
+    ret = lambda tok: textacy.preprocess.fix_bad_unicode(tok.lower_, normalization=u'NFC')
+
+    ret.__annotations__ = get_calling_function().__annotations__
+    return ret
+
+
 def resolveAbbreviations():
     pass #todo
 
@@ -396,10 +447,44 @@ def lower() -> spacy.tokens.Doc:
     return ret
 
 
+################################################################################################################
+
 
+path2xml = config.get("filepath","path2xml")
+
 
 
+path2csv = config.get("filepath","path2csv")
+path2csv = "M42-Export/Tickets_med.csv"
+printlog("CSV: {0}".format(path2csv))
 
 
 ticketcorpus = textacy.Corpus(DE_PARSER)
 
 
+#idee ß zu ss ändern? prinzipiell?
+
+"""
+vllt kategorien in unterkategorien aufteilen
+
+allg:
+utf-korregieren, bei sonderzeichen wörter trennen
+namen raus
+
+emails, urls, nummern raus
+vllt sogar alles, was ebend jenes enthält (oder auf .toplvldomain bzw. sonderzeichen enthält oder alles was ein @ enthält
+
+sinnvoller wörter von müll trennen: 8203;verfügung
+
+abkürzungen raus: m.a, o.ä.
+
+
+sinnlose bsp: nr54065467 455a33c5 tvt?= ------problem--------
+
+"""
+
+
 metaliste = [
     "Subject",
@@ -409,45 +494,55 @@ metaliste = [
 
 clean_in_meta = {
     "Solution":[removePOS(["SPACE"]),lower()],
-    "Subject":[removePOS(["SPACE","PUNCT"]),lower()]
+    "Subject":[removePOS(["SPACE","PUNCT"]),lower()],
+    "categoryName": [removePOS(["SPACE", "PUNCT"]), lower()]
 }
 
 
+printlog("Start Preprocessing")
+
 clean_in_content=[
-    #removePOS(["SPACE","PUNCT","NUM"]),
-    keepPOS(["NOUN"]),
     removePOS(["SPACE","PUNCT","NUM"]),
+    remove_words_containing_Numbers(),
+    remove_words_containing_specialCharacters(),
+
     #replaceURLs(),
     #replaceEmails(),
-    lemmatize(),
-    #removeWords(de_stop_words),
-    keepUniqeTokens(),
-    removePOS(["PUNCT"]),
+    #fixUnicode(),
+    #lemmatize(),
+    #removeWords(de_stop_words + config.get("preprocessing","custom_words").split(",")),
+
+    #removeENT("PERSON"),
+    #keepPOS(["NOUN"]),
+    #keepUniqeTokens(),
+    #keepENT(config.get("preprocessing","ents2keep"))
+
 ]
 
 
 
 ## add files to textacy-corpus,
-print("add texts to textacy-corpus...")
+printlog("add texts to textacy-corpus")
 ticketcorpus.add_texts(
     processTextstream(csv_to_contentStream(path2csv,"Description"), clean_in_content),
     processDictstream(csv_to_metaStream(path2csv,metaliste),clean_in_meta)
 )
 
+for i in range(10):
     printRandomDoc(ticketcorpus)
 
 
 end = time.time()
-print("\n\nTime Elapsed Preprocessing:{0}\n\n".format(end - start))
+printlog("Time Elapsed Preprocessing:{0} min".format((end - start)/60))
 
 
 
-print("############################################ Topic Modeling #############################################")
+############################################ Topic Modeling #############################################
 print("\n\n")
 start = time.time()
 
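With this commit the content pipeline is reduced to POS removal plus the two new regex filters; lemmatization, stop-word removal and entity handling stay commented out. A minimal sketch (an assumption, since processTokens itself is not part of this diff) of how such a list of annotated factories could be applied to a token stream, using plain strings instead of spaCy tokens; lower_case is a hypothetical mapper added for illustration:

# hedged sketch: applying a cleaning list of annotated factories to plain-string tokens;
# processTokens is not shown in this diff, so the filter/map dispatch here is an assumption
import re

def remove_words_containing_Numbers():
    ret = lambda w: not bool(re.search(r'\d', w))
    ret.__annotations__ = {'return': bool}   # bool annotation -> used as a filter
    return ret

def lower_case():                            # hypothetical mapper, not from the diff
    ret = lambda w: w.lower()
    ret.__annotations__ = {'return': str}    # str annotation -> used as a mapper
    return ret

def apply_cleaning(words, funclist):
    for f in funclist:
        if f.__annotations__.get('return') is bool:
            words = [w for w in words if f(w)]   # drop tokens failing the filter
        else:
            words = [f(w) for w in words]        # transform tokens
    return words

print(apply_cleaning(["Drucker", "nr54065467", "defekt"],
                     [remove_words_containing_Numbers(), lower_case()]))
# ['drucker', 'defekt']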
@@ -482,7 +577,7 @@ def generate_labled_lines(textacyCorpus):
 
 ####################'####################' todo alles in config
 
-ngrams = (1)
+ngrams = 1
 
 min_df = 0
 max_df = 1.0
@@ -508,7 +603,7 @@ n_topics = len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen
 
 
 
-print("\nvectorize corpus...")
+printlog("vectorize corpus...")
 vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
 
 terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in ticketcorpus)
@@ -528,12 +623,12 @@ id2term = vectorizer.__getattribute__("id_to_term")
 ##################### LSA, LDA, NMF Topic Modeling via Textacy ##############################################
 
 # Initialize and train a topic model
-print("\nInitialize and train a topic model..")
+printlog("Initialize and train a topic model..")
 model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
 model.fit(doc_term_matrix)
 
 #Transform the corpus and interpret our model:
-print("Transform the corpus and interpret our model..")
+printlog("Transform the corpus and interpret our model..")
 doc_topic_matrix = model.transform(doc_term_matrix)
 print()
 
@@ -552,7 +647,7 @@ for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_docu
 print()
 print()
 
-
+"""
 
 
 
@@ -560,19 +655,20 @@
 
 
 
-jgibbsLLDA_root = "java_LabledLDA/"
-filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
+jgibbsLLDA_root = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/java_LabledLDA/"
+LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
 
 
 #create file
-textacy.fileio.write_file_lines(generate_labled_lines(ticketcorpus),filepath=filepath)
+textacy.fileio.write_file_lines(generate_labled_lines(ticketcorpus), filepath=LLDA_filepath)
 
 
 # wait for file to exist
-while not os.path.exists(filepath):
+while not os.path.exists(LLDA_filepath):
     time.sleep(1)
 
-print("\nstart LLDA:\n")
+print("\n\n")
+printlog("start LLDA:")
 #run JGibsslda file
 FNULL = open(os.devnull, 'w') # supress output
 subprocess.call(["java",
@@ -586,7 +682,7 @@ subprocess.call(["java",
 
 # ANMERKUNG: Dateien sind versteckt. zu finden in models/
 
-#print twords
+#twords
 subprocess.call(["gzip",
                  "-dc",
                  "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)])
@@ -594,7 +690,7 @@ subprocess.call(["gzip",
 print()
 print()
 
-
+"""
 
 
 
@@ -602,10 +698,4 @@ print()
 
 
 end = time.time()
-print("\n\n\nTime Elapsed Topic Modeling:{0}\n\n".format(end - start))
-
-
-
-sys.stdout = old_stdout
-
-log_file.close()
+printlog("\n\n\nTime Elapsed Topic Modeling:{0}\n\n".format(end - start))