added regex for further cleaning

jannis.grundmann 2017-09-15 14:32:44 +02:00
parent 13ec7cdef4
commit 092052dfe1
9 changed files with 358642 additions and 67 deletions
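The "regex for further cleaning" refers to two new token filters in test.py below (remove_words_containing_Numbers, remove_words_containing_specialCharacters). A minimal standalone sketch of the same idea on plain strings rather than spaCy tokens (everything except the two regexes is illustrative):

import re

def contains_number(word):
    # True if the token contains any digit, e.g. "nr54065467"
    return bool(re.search(r'\d', word))

def contains_special_character(word):
    # True if the token contains punctuation or other special characters, e.g. "------problem--------"
    return bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?]', word))

tokens = ["drucker", "nr54065467", "------problem--------", "verfügung"]
print([t for t in tokens if not contains_number(t) and not contains_special_character(t)])
# ['drucker', 'verfügung']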

config.ini
View File

@@ -1,7 +1,19 @@
-[default]
-thesauruspath = openthesaurus.csv
-path2xml = ticketSamples.xml
+[filepath]
+thesauruspath = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/openthesaurus.csv
+path2xml = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/ticketSamples.xml
+path2csv = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_2017-09-13.csv
+small = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv
+logfile = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log
+lemmas = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmatization-de.txt
language = de
@@ -9,7 +21,7 @@ language = de
ents2keep = WORK_OF_ART,ORG,PRODUCT,LOC
-custom_words = grüßen,fragen
+custom_words = grüßen,fragen,damen,probleme,herren,dank
#lemmatize = True
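The section rename from [default] to [filepath] means every config.get("default", ...) call has to be updated as well (test.py below does this). A minimal sketch of reading the new section, assuming the config.ini shown above:

import configparser
import logging

config = configparser.ConfigParser()
with open("config.ini") as f:
    config.read_file(f)

# the new keys live in [filepath] now
lemmas = config.get("filepath", "lemmas")
logging.basicConfig(filename=config.get("filepath", "logfile"), level=logging.INFO)
logging.info("lemma list: %s" % lemmas)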

lemmatization-de.txt (new file, 358473 added lines)
File diff suppressed because it is too large

View File

@@ -22,7 +22,7 @@ config = ConfigParser.ConfigParser()
with open("config.ini") as f:
config.read_file(f)
#todo print&log
path2xml = config.get("default","path2xml")
@@ -281,7 +281,7 @@ topicModel = 'lda'
weighting = ('tf' if topicModel == 'lda' else 'tfidf')
top_topic_words = 5
-top_document_labels_per_topic = 2
+top_document_labels_per_topic = 5
n_topics = len(set(corpus[0].metadata.keys()))+1 #+1 because of a default topic
@@ -311,7 +311,7 @@ id2term = vectorizer.__getattribute__("id_to_term")
"""
##################### LSA, LDA, NMF Topic Modeling via Textacy ##############################################
# Initialize and train a topic model
@@ -339,7 +339,7 @@ for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_docu
print()
print()
"""

test.py (208 changed lines)
View File

@@ -1,7 +1,11 @@
# -*- coding: utf-8 -*-
import time
start = time.time()
+import logging
import csv
import functools
@@ -16,45 +20,55 @@ import textacy
from scipy import *
from textacy import Vectorizer
import warnings
-csv.field_size_limit(sys.maxsize)
+import configparser as ConfigParser
import sys
-old_stdout = sys.stdout
+csv.field_size_limit(sys.maxsize)
-log_file = open("printout.log","w")
-sys.stdout = log_file
# Load the configuration file
-import configparser as ConfigParser
+config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
config = ConfigParser.ConfigParser()
-with open("config.ini") as f:
+with open(config_ini) as f:
config.read_file(f)
+# config logging
+logging.basicConfig(filename=config.get("filepath","logfile"), level=logging.INFO)
-path2csv = "M42-Export/Tickets_small.csv"
-#path2csv = "M42-Export/Tickets_2017-09-13.csv" # 21167
-path2xml = config.get("default","path2xml")
-thesauruspath = config.get("default","thesauruspath")
-DE_PARSER = spacy.load("de")
-de_stop_words=list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)
+thesauruspath = config.get("filepath","thesauruspath")
THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";"))
+DE_PARSER = spacy.load("de") #todo language detection idea: separate corpora for different languages
+de_stop_words=list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)
+LEMMAS=config.get("filepath","lemmas")
############# misc
+def printlog(string, level="INFO"):
+    """log and prints"""
+    print(string)
+    if level=="INFO":
+        logging.info(string)
+    elif level=="DEBUG":
+        logging.debug(string)
+    elif level == "WARNING":
+        logging.warning(string)
+printlog("Load functions")
def compose(*functions):
def compose2(f, g):
return lambda x: f(g(x))
@@ -84,17 +98,17 @@ def get_calling_function():
return func
raise AttributeError("func not found")
def printRandomDoc(textacyCorpus):
import random
print()
-print("len(textacyCorpus) = %i" % len(textacyCorpus))
+printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
randIndex = int((len(textacyCorpus) - 1) * random.random())
-print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
+printlog("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
print()
############# load xml
def generateMainTextfromTicketXML(path2xml, main_textfield='Description'):
"""
@@ -111,6 +125,7 @@ def generateMainTextfromTicketXML(path2xml, main_textfield='Description'):
for field in ticket:
if field.tag == main_textfield:
yield field.text
def generateMetadatafromTicketXML(path2xml, leave_out=['Description']):
tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
root = tree.getroot()
@@ -144,6 +159,7 @@ def csv_to_contentStream(path2csv: str, content_collumn_name: str):
content_collumn = j
else:
yield lst[content_collumn]
def csv_to_metaStream(path2csv: str, metalist: [str]):
"""
:param path2csv: string
@@ -171,9 +187,8 @@ def csv_to_metaStream(path2csv: str, metalist: [str]):
+############################################ Preprocessing ##############################################
-print("############################################ Preprocessing ##############################################")
-print()
############# on str-gen
@@ -211,7 +226,11 @@ def processTextstream(textstream, funclist, parser=DE_PARSER):
pipe = parser.pipe(textstream)
for doc in pipe:
-tokens = [tok for tok in doc]
+tokens = []
+for tok in doc:
+    tokens.append(tok)
tokens = processTokens(tokens,funclist,parser)
yield " ".join([tok.lower_ for tok in tokens])
@@ -264,7 +283,6 @@ def removePOS(pos_list)-> bool:
return ret
def removeWords(words, keep=None)-> bool:
-#todo in: str or str-list
if hasattr(keep, '__iter__'):
for k in keep:
try:
@@ -289,13 +307,37 @@ def removeENT(ent_list) -> bool:
ret.__annotations__ = get_calling_function().__annotations__
return ret
-def lemmatize() -> str:
+def remove_words_containing_Numbers() -> bool:
-ret = lambda tok: tok.lemma_
+ret = lambda tok: not bool(re.search('\d', tok.lower_))
ret.__annotations__ = get_calling_function().__annotations__
return ret
+def remove_words_containing_specialCharacters() -> bool:
+    ret = lambda tok: not bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?]', tok.lower_))
+    ret.__annotations__ = get_calling_function().__annotations__
+    return ret
+def lemmatizeWord(word,filepath=LEMMAS):
+    """http://www.lexiconista.com/datasets/lemmatization/"""
+    for line in list(textacy.fileio.read_file_lines(filepath=filepath)):
+        if word.lower() == line.split()[1].strip().lower():
+            return line.split()[0].strip().lower()
+    return word.lower()  # if nothing was found
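lemmatizeWord re-reads the whole lemma list for every token. A sketch of the same lookup loaded once into a dict, assuming the whitespace-separated "lemma inflected-form" line format that the indexing above implies (load_lemma_dict is illustrative, not part of this commit):

def load_lemma_dict(filepath):
    # maps inflected form -> lemma, both lowercased
    lemma_of = {}
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            if len(parts) >= 2:
                lemma_of[parts[1].lower()] = parts[0].lower()
    return lemma_of

LEMMA_OF = load_lemma_dict("lemmatization-de.txt")

def lemmatize_word(word):
    # falls back to the lowercased word itself, like lemmatizeWord above
    return LEMMA_OF.get(word.lower(), word.lower())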
+def lemmatize() -> str:
+    ret = lambda tok: lemmatizeWord(tok.lower_)
+    ret.__annotations__ = get_calling_function().__annotations__
+    return ret
def normalizeSynonyms(default_return_first_Syn=False) -> str:
ret = lambda tok : getFirstSynonym(tok.lower_, default_return_first_Syn=default_return_first_Syn)
@@ -377,6 +419,15 @@ def replacePhonenumbers(replace_with="PHONENUMBER") -> str:
return ret
+def fixUnicode() -> str:
+    ret = lambda tok: textacy.preprocess.fix_bad_unicode(tok.lower_, normalization=u'NFC')
+    ret.__annotations__ = get_calling_function().__annotations__
+    return ret
def resolveAbbreviations():
pass #todo
@@ -396,10 +447,44 @@ def lower() -> spacy.tokens.Doc:
return ret
+################################################################################################################
+path2xml = config.get("filepath","path2xml")
+path2csv = config.get("filepath","path2csv")
+path2csv = "M42-Export/Tickets_med.csv"
+printlog("CSV: {0}".format(path2csv))
ticketcorpus = textacy.Corpus(DE_PARSER)
+#idea: change ß to ss? in general?
+"""
+maybe split categories into subcategories
+general:
+fix unicode, split words at special characters
+remove names
+remove e-mails, urls, numbers
+maybe even remove everything that contains any of those (or ends in a .topleveldomain, contains special characters, or contains an @)
+separate meaningful words from garbage: 8203;verfügung
+remove abbreviations: m.a, o.ä.
+meaningless examples: nr54065467 455a33c5 tvt?= ------problem--------
+"""
metaliste = [
"Subject",
@@ -409,45 +494,55 @@ metaliste = [
clean_in_meta = {
"Solution":[removePOS(["SPACE"]),lower()],
-"Subject":[removePOS(["SPACE","PUNCT"]),lower()]
+"Subject":[removePOS(["SPACE","PUNCT"]),lower()],
+"categoryName": [removePOS(["SPACE", "PUNCT"]), lower()]
}
+printlog("Start Preprocessing")
clean_in_content=[
+#removePOS(["SPACE","PUNCT","NUM"]),
+keepPOS(["NOUN"]),
removePOS(["SPACE","PUNCT","NUM"]),
+remove_words_containing_Numbers(),
+remove_words_containing_specialCharacters(),
#replaceURLs(),
#replaceEmails(),
-lemmatize(),
+#fixUnicode(),
+#removeWords(de_stop_words),
-keepUniqeTokens(),
+#lemmatize(),
-removePOS(["PUNCT"]),
+#removeWords(de_stop_words + config.get("preprocessing","custom_words").split(",")),
+#removeENT("PERSON"),
+#keepPOS(["NOUN"]),
+#keepUniqeTokens(),
+#keepENT(config.get("preprocessing","ents2keep"))
]
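Each factory above copies its return-type annotation onto the lambda it returns (ret.__annotations__ = get_calling_function().__annotations__), which suggests the pipeline dispatches on that annotation: bool-returning entries act as token filters, str-returning ones as token replacements. processTokens itself is not part of this diff, so the following is only a guess at that mechanism:

def apply_funclist(tokens, funclist):
    # guessed dispatch: 'bool' entries filter tokens, 'str' entries map them
    for func in funclist:
        returns = func.__annotations__.get("return")
        if returns is bool:
            tokens = [tok for tok in tokens if func(tok)]
        elif returns is str:
            tokens = [func(tok) for tok in tokens]
    return tokens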
## add files to textacy-corpus,
-print("add texts to textacy-corpus...")
+printlog("add texts to textacy-corpus")
ticketcorpus.add_texts(
processTextstream(csv_to_contentStream(path2csv,"Description"), clean_in_content),
processDictstream(csv_to_metaStream(path2csv,metaliste),clean_in_meta)
)
-printRandomDoc(ticketcorpus)
+for i in range(10):
+    printRandomDoc(ticketcorpus)
end = time.time()
-print("\n\nTime Elapsed Preprocessing:{0}\n\n".format(end - start))
+printlog("Time Elapsed Preprocessing:{0} min".format((end - start)/60))
-print("############################################ Topic Modeling #############################################")
+############################################ Topic Modeling #############################################
print("\n\n")
start = time.time()
@@ -482,7 +577,7 @@ def generate_labled_lines(textacyCorpus):
####################'####################' todo: everything into the config
-ngrams = (1)
+ngrams = 1
min_df = 0
max_df = 1.0
@@ -508,7 +603,7 @@ n_topics = len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen
-print("\nvectorize corpus...")
+printlog("vectorize corpus...")
vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in ticketcorpus)
@@ -528,12 +623,12 @@ id2term = vectorizer.__getattribute__("id_to_term")
##################### LSA, LDA, NMF Topic Modeling via Textacy ##############################################
# Initialize and train a topic model
-print("\nInitialize and train a topic model..")
+printlog("Initialize and train a topic model..")
model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
model.fit(doc_term_matrix)
#Transform the corpus and interpret our model:
-print("Transform the corpus and interpret our model..")
+printlog("Transform the corpus and interpret our model..")
doc_topic_matrix = model.transform(doc_term_matrix)
print()
@@ -544,15 +639,15 @@ for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=t
print()
for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
print(topic_idx)
for j in top_docs:
print(ticketcorpus[j].metadata['categoryName'])
#####################################################################################################################
print()
print()
"""
@@ -560,19 +655,20 @@ print()
-jgibbsLLDA_root = "java_LabledLDA/"
+jgibbsLLDA_root = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/java_LabledLDA/"
-filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
+LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
#create file
-textacy.fileio.write_file_lines(generate_labled_lines(ticketcorpus),filepath=filepath)
+textacy.fileio.write_file_lines(generate_labled_lines(ticketcorpus), filepath=LLDA_filepath)
# wait for file to exist
-while not os.path.exists(filepath):
+while not os.path.exists(LLDA_filepath):
time.sleep(1)
-print("\nstart LLDA:\n")
+print("\n\n")
+printlog("start LLDA:")
#run JGibsslda file
FNULL = open(os.devnull, 'w') # supress output
subprocess.call(["java",
@@ -586,7 +682,7 @@ subprocess.call(["java",
# NOTE: the files are hidden. to be found in models/
-#print twords
+#twords
subprocess.call(["gzip",
"-dc",
"{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)])
@@ -594,7 +690,7 @@ subprocess.call(["gzip",
print()
print()
"""
@@ -602,10 +698,4 @@ print()
end = time.time()
-print("\n\n\nTime Elapsed Topic Modeling:{0}\n\n".format(end - start))
+printlog("\n\n\nTime Elapsed Topic Modeling:{0}\n\n".format(end - start))
-sys.stdout = old_stdout
-log_file.close()