ready for a first topic modeling run on the entire tickets.csv
This commit is contained in:
parent
26c0f37ec8
commit
13ec7cdef4
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
280
test.py
@@ -18,6 +18,16 @@ from textacy import Vectorizer

import warnings

csv.field_size_limit(sys.maxsize)

import sys


old_stdout = sys.stdout

log_file = open("printout.log","w")

sys.stdout = log_file



# Load the configuration file
@@ -28,17 +38,17 @@ with open("config.ini") as f:

path2csv = "M42-Export/Tickets_small.csv"
#path2csv = "M42-Export/Tickets_2017-09-13.csv" # 21167

path2xml = config.get("default","path2xml")
thesauruspath = config.get("default","thesauruspath")


DE_PARSER = spacy.load("de")

de_stop_words = list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)


corpus = textacy.Corpus(DE_PARSER)

THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";"))
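# Note: DE_PARSER.lang is "de", so the dynamic __import__ a few lines above is just an
# indirect way of writing "from spacy.de import STOP_WORDS" for the spaCy version used
# here; the indirection only matters if a different language model gets loaded.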
@@ -85,11 +95,8 @@ def printRandomDoc(textacyCorpus):

    print()



############# on xml
def generateMainTextfromTicketXML(path2xml, main_textfield='Beschreibung'):
############# load xml
def generateMainTextfromTicketXML(path2xml, main_textfield='Description'):
    """
    generates strings from XML
    :param path2xml:
@@ -104,7 +111,7 @@ def generateMainTextfromTicketXML(path2xml, main_textfield='Beschreibung'):

        for field in ticket:
            if field.tag == main_textfield:
                yield field.text

def generateMetadatafromTicketXML(path2xml, leave_out=['Beschreibung']):
def generateMetadatafromTicketXML(path2xml, leave_out=['Description']):
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()
@@ -118,8 +125,7 @@ def generateMetadatafromTicketXML(path2xml, leave_out=['Beschreibung']):

        yield metadata



############# on csv
############# load csv

def csv_to_contentStream(path2csv: str, content_collumn_name: str):
    """
@@ -164,11 +170,13 @@ def csv_to_metaStream(path2csv: str, metalist: [str]):

        yield metadata



print("############################################ Preprocessing ##############################################")
print()

############# on str-gen



def processTokens(tokens, funclist, parser):
    # in: tokenlist, funclist
    # out: tokenlist
@@ -182,36 +190,16 @@ def processTokens(tokens, funclist, parser):

            tokens = [tok for tok in doc]  # tokens only

        elif 'spacy.tokens.doc.Doc' in str(f.__annotations__):
            toks = f(tokens)
            tokens = [tok for tok in toks]

            # todo: feels hacked together
            doc = parser(" ".join(tok.lower_ for tok in tokens))  # parsed
            tokens = f(doc)
            doc = parser(" ".join(tokens))  # parsed
            tokens = [tok for tok in doc]  # tokens only
        else:
            warnings.warn("Unknown Annotation while preprocessing. Function: {0}".format(str(f)))

    return tokens




############# return docs

def keepUniqueTokens() -> spacy.tokens.Doc:
    # todo in: tokens, out: doc
    ret = lambda doc: (set([tok.lower_ for tok in doc]))

    ret.__annotations__ = get_calling_function().__annotations__
    return ret






def processTextstream(textstream, funclist, parser=DE_PARSER):
    """
    :param textstream: string-gen
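# Each cleaning step is a small factory: it returns a lambda and copies the factory's
# own return annotation onto it via get_calling_function().__annotations__, and
# processTokens() reads that annotation to decide how to apply the step (per-token
# filter, string replacement, or doc-level transformation). A minimal sketch of a
# custom step in the same pattern (removeShortTokens and min_len are hypothetical,
# not part of this commit):
#
#     def removeShortTokens(min_len=3) -> bool:
#         ret = lambda tok: len(tok.lower_) >= min_len
#         ret.__annotations__ = get_calling_function().__annotations__
#         return ret
#
# Such a step could then be appended to a cleaning list like clean_in_content below.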
@@ -261,8 +249,7 @@ def processDictstream(dictstream, funcdict, parser=DE_PARSER):

        yield result


############# return tokens
############# return bool

def keepPOS(pos_list) -> bool:
    ret = lambda tok: tok.pos_ in pos_list
@@ -352,7 +339,6 @@ def getHauptform(syn_block, word, default_return_first_Syn=False):

    return word  # as a fallback, return the original word


############# return strings

mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
@@ -395,28 +381,50 @@ def resolveAbbreviations():

    pass  # todo


############# return docs  # todo: not quite accurate, this yields a token set rather than a doc

def keepUniqeTokens() -> spacy.tokens.Doc:
    ret = lambda doc: (set([tok.lower_ for tok in doc]))

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def lower() -> spacy.tokens.Doc:
    ret = lambda doc: ([tok.lower_ for tok in doc])

    ret.__annotations__ = get_calling_function().__annotations__
    return ret



ticketcorpus = textacy.Corpus(DE_PARSER)



metaliste = [
    "Subject",
    "categoryName",
    "Solution"
]
path2csv = "M42-Export/Tickets_small.csv"



clean_in_meta = {
    "Solution": [removePOS(["SPACE"])],
    "Subject": [removePOS(["SPACE","PUNCT"])]
    "Solution": [removePOS(["SPACE"]), lower()],
    "Subject": [removePOS(["SPACE","PUNCT"]), lower()]
}




clean_in_content = [
    removePOS(["SPACE","PUNCT","NUM"]),
    #removePOS(["SPACE","PUNCT","NUM"]),
    keepPOS(["NOUN"]),
    replaceURLs(),
    replaceEmails(),
    removeWords(de_stop_words),
    lemmatize()
    removePOS(["SPACE","PUNCT","NUM"]),
    #replaceURLs(),
    #replaceEmails(),
    lemmatize(),
    #removeWords(de_stop_words),
    keepUniqeTokens(),
    removePOS(["PUNCT"]),

]
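# A rough sketch of how the content pipeline could be exercised on its own, assuming
# processTextstream accepts any iterable of strings (the example tickets are made up;
# the real run below streams straight from the CSV):
#
#     sample = ["Der Drucker im Büro druckt nicht mehr.", "VPN Zugang funktioniert nicht."]
#     for cleaned in processTextstream(iter(sample), clean_in_content, parser=DE_PARSER):
#         print(cleaned)   # cleaned form of one ticket text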
@@ -424,17 +432,169 @@ clean_in_content=[


## add files to textacy-corpus
print("add texts to textacy-corpus...")
corpus.add_texts(
ticketcorpus.add_texts(
    processTextstream(csv_to_contentStream(path2csv,"Description"), clean_in_content),
    processDictstream(csv_to_metaStream(path2csv,metaliste), clean_in_meta)
)

printRandomDoc(corpus)
printRandomDoc(ticketcorpus)


end = time.time()
print("\n\nTime Elapsed Preprocessing:{0}\n\n".format(end - start))


print("############################################ Topic Modeling #############################################")
|
||||
print("\n\n")
|
||||
start = time.time()
|
||||
|
||||
|
||||
|
||||
# build citionary of ticketcategories
|
||||
labelist = []
|
||||
|
||||
for texdoc in ticketcorpus.get(lambda texdoc : texdoc.metadata["categoryName"] not in labelist):
|
||||
labelist.append(texdoc.metadata["categoryName"])
|
||||
|
||||
|
||||
LABELDICT = {k: v for v, k in enumerate(labelist)}
|
||||
print(LABELDICT)
|
||||
|
||||
|
||||
|
||||
def label2ID(label,labeldict=LABELDICT):
|
||||
return labeldict.get(label,len(labeldict))
|
||||
|
||||
def generate_labled_lines(textacyCorpus):
|
||||
for doc in textacyCorpus:
|
||||
# generate [topic1, topic2....] tok1 tok2 tok3 out of corpus
|
||||
yield "[" + str(label2ID(doc.metadata["categoryName"])) + "] " + doc.text
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
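# A small worked example of the labeled-line format (hypothetical category names and
# tokens, not taken from the real CSV): with LABELDICT = {"Hardware": 0, "Software": 1},
# label2ID("Software") returns 1 and label2ID("Unbekannt") falls back to 2, the id for
# unseen labels, so generate_labled_lines() would yield a line such as
#
#     [1] drucker treiber installieren fehler
#
# i.e. the numeric category id in brackets followed by the cleaned ticket text, which
# is the input format the JGibbsLabledLDA run further down expects.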
####################'####################' todo: move all of this into the config file

ngrams = (1)

min_df = 0
max_df = 1.0
no_below = 20
no_above = 0.5

topicModel = 'lda'
# http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
weighting = ('tf' if topicModel == 'lda' else 'tfidf')

top_topic_words = 7
top_document_labels_per_topic = 2

n_topics = len(LABELDICT)  #len(set(ticketcorpus[0].metadata.keys()))+1  # +1 for a default topic


####################'####################




print("\nvectorize corpus...")
vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)

terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in ticketcorpus)
doc_term_matrix = vectorizer.fit_transform(terms_list)
id2term = vectorizer.id_to_term


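# Shape sketch (the numbers are purely illustrative): with roughly 100 tickets and
# 2000 distinct terms, doc_term_matrix is a sparse 100 x 2000 matrix of term weights
# and id2term maps column indices back to term strings, e.g.
#
#     print(doc_term_matrix.shape)   # (n_docs, n_terms)
#     print(id2term[0])              # the term behind column 0
#
# The topic model below factorizes this matrix into doc-topic and topic-term parts.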
##################### LSA, LDA, NMF Topic Modeling via Textacy ##############################################

# Initialize and train a topic model
print("\nInitialize and train a topic model..")
model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
model.fit(doc_term_matrix)

# Transform the corpus and interpret our model:
print("Transform the corpus and interpret our model..")
doc_topic_matrix = model.transform(doc_term_matrix)
print()


for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
    print('topic', topic_idx, ':', ' '.join(top_terms))


print()
for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
    print(topic_idx)
    for j in top_docs:
        print(ticketcorpus[j].metadata['categoryName'])

#####################################################################################################################
print()
print()


##################### LLDA Topic Modeling via JGibbsLabledLDA ##############################################


jgibbsLLDA_root = "java_LabledLDA/"
filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)


# create the input file
textacy.fileio.write_file_lines(generate_labled_lines(ticketcorpus), filepath=filepath)


# wait for the file to exist
while not os.path.exists(filepath):
    time.sleep(1)

print("\nstart LLDA:\n")
# run the JGibbsLDA jar
FNULL = open(os.devnull, 'w')  # suppress output
subprocess.call(["java",
                 "-cp", "{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(jgibbsLLDA_root),
                 "jgibblda.LDA",
                 "-est",
                 "-dir", "{0}models/tickets".format(jgibbsLLDA_root),
                 "-dfile", "tickets.gz",
                 "-twords", str(top_topic_words),
                 "-ntopics", str(n_topics)], stdout=FNULL)
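# The subprocess call above amounts to running (with jgibbsLLDA_root as configured):
#
#     java -cp "java_LabledLDA/lib/trove-3.0.3.jar:java_LabledLDA/lib/args4j-2.0.6.jar:java_LabledLDA/out/production/LabledLDA/" \
#          jgibblda.LDA -est -dir java_LabledLDA/models/tickets -dfile tickets.gz \
#          -twords 7 -ntopics <n_topics>
#
# with its stdout discarded via FNULL.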
# NOTE: the output files are hidden (dot-prefixed); they can be found in models/

# print the top words per topic
subprocess.call(["gzip",
                 "-dc",
                 "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)])
#####################################################################################################################
print()
print()


@@ -442,4 +602,10 @@ printRandomDoc(corpus)


end = time.time()
print("\n\n\nTime Elapsed:{0}".format(end - start))
print("\n\n\nTime Elapsed Topic Modeling:{0}\n\n".format(end - start))



sys.stdout = old_stdout

log_file.close()