Added regex for further cleaning

This commit is contained in:
jannis.grundmann 2017-09-15 14:32:44 +02:00
parent 13ec7cdef4
commit 092052dfe1
9 changed files with 358642 additions and 67 deletions

View File

@ -1,7 +1,19 @@
[default]
[filepath]
thesauruspath = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/openthesaurus.csv
path2xml = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/ticketSamples.xml
path2csv = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_2017-09-13.csv
small = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv
logfile = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log
lemmas = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmatization-de.txt
thesauruspath = openthesaurus.csv
path2xml = ticketSamples.xml
language = de
@ -9,7 +21,7 @@ language = de
ents2keep = WORK_OF_ART,ORG,PRODUCT,LOC
custom_words = grüßen,fragen
custom_words = grüßen,fragen,damen,probleme,herren,dank
#lemmatize = True
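The renamed [filepath] section above is read via configparser further down in test.py; the following is a minimal sketch of that access pattern, assuming config.ini is in the working directory (the [preprocessing] section name is taken from the config.get calls in test.py below):

import configparser as ConfigParser

config = ConfigParser.ConfigParser()
with open("config.ini") as f:
    config.read_file(f)

lemmas = config.get("filepath", "lemmas")                               # lemmatization-de.txt added in this commit
custom_words = config.get("preprocessing", "custom_words").split(",")   # grüßen, fragen, damen, ...
print(lemmas, custom_words)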

358473
lemmatization-de.txt Normal file

File diff suppressed because it is too large

View File

@ -22,7 +22,7 @@ config = ConfigParser.ConfigParser()
with open("config.ini") as f:
config.read_file(f)
#todo print&log
path2xml = config.get("default","path2xml")
@ -281,7 +281,7 @@ topicModel = 'lda'
weighting = ('tf' if topicModel == 'lda' else 'tfidf')
top_topic_words = 5
top_document_labels_per_topic = 2
top_document_labels_per_topic = 5
n_topics = len(set(corpus[0].metadata.keys()))+1 # +1 for a default topic
@ -311,7 +311,7 @@ id2term = vectorizer.__getattribute__("id_to_term")
"""
##################### LSA, LDA, NMF Topic Modeling via Textacy ##############################################
# Initialize and train a topic model
@ -339,7 +339,7 @@ for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_docu
print()
print()
"""

202
test.py
View File

@ -1,7 +1,11 @@
# -*- coding: utf-8 -*-
import time
start = time.time()
import logging
import csv
import functools
@ -16,45 +20,55 @@ import textacy
from scipy import *
from textacy import Vectorizer
import warnings
csv.field_size_limit(sys.maxsize)
import configparser as ConfigParser
import sys
old_stdout = sys.stdout
log_file = open("printout.log","w")
sys.stdout = log_file
csv.field_size_limit(sys.maxsize)
# Load the configuration file
import configparser as ConfigParser
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
config = ConfigParser.ConfigParser()
with open("config.ini") as f:
with open(config_ini) as f:
config.read_file(f)
# config logging
logging.basicConfig(filename=config.get("filepath","logfile"), level=logging.INFO)
path2csv = "M42-Export/Tickets_small.csv"
#path2csv = "M42-Export/Tickets_2017-09-13.csv" # 21167
path2xml = config.get("default","path2xml")
thesauruspath = config.get("default","thesauruspath")
DE_PARSER = spacy.load("de")
de_stop_words=list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)
thesauruspath = config.get("filepath","thesauruspath")
THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";"))
DE_PARSER = spacy.load("de") #todo language detection; idea: separate corpora for different languages
de_stop_words=list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)
LEMMAS=config.get("filepath","lemmas")
############# misc
def printlog(string, level="INFO"):
"""log and prints"""
print(string)
if level=="INFO":
logging.info(string)
elif level=="DEBUG":
logging.debug(string)
elif level == "WARNING":
logging.warning(string)
printlog("Load functions")
def compose(*functions):
def compose2(f, g):
return lambda x: f(g(x))
@ -84,17 +98,17 @@ def get_calling_function():
return func
raise AttributeError("func not found")
def printRandomDoc(textacyCorpus):
import random
print()
print("len(textacyCorpus) = %i" % len(textacyCorpus))
printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
randIndex = int((len(textacyCorpus) - 1) * random.random())
print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
printlog("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
print()
############# load xml
def generateMainTextfromTicketXML(path2xml, main_textfield='Description'):
"""
@ -111,6 +125,7 @@ def generateMainTextfromTicketXML(path2xml, main_textfield='Description'):
for field in ticket:
if field.tag == main_textfield:
yield field.text
def generateMetadatafromTicketXML(path2xml, leave_out=['Description']):
tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
root = tree.getroot()
@ -144,6 +159,7 @@ def csv_to_contentStream(path2csv: str, content_collumn_name: str):
content_collumn = j
else:
yield lst[content_collumn]
def csv_to_metaStream(path2csv: str, metalist: [str]):
"""
:param path2csv: string
@ -171,9 +187,8 @@ def csv_to_metaStream(path2csv: str, metalist: [str]):
############################################ Preprocessing ##############################################
print("############################################ Preprocessing ##############################################")
print()
############# on str-gen
@ -211,7 +226,11 @@ def processTextstream(textstream, funclist, parser=DE_PARSER):
pipe = parser.pipe(textstream)
for doc in pipe:
tokens = [tok for tok in doc]
tokens = []
for tok in doc:
tokens.append(tok)
tokens = processTokens(tokens,funclist,parser)
yield " ".join([tok.lower_ for tok in tokens])
@ -264,7 +283,6 @@ def removePOS(pos_list)-> bool:
return ret
def removeWords(words, keep=None)-> bool:
#todo input: str or list of str
if hasattr(keep, '__iter__'):
for k in keep:
try:
@ -289,13 +307,37 @@ def removeENT(ent_list) -> bool:
ret.__annotations__ = get_calling_function().__annotations__
return ret
def lemmatize() -> str:
ret = lambda tok: tok.lemma_
def remove_words_containing_Numbers() -> bool:
ret = lambda tok: not bool(re.search('\d', tok.lower_))
ret.__annotations__ = get_calling_function().__annotations__
return ret
def remove_words_containing_specialCharacters() -> bool:
ret = lambda tok: not bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?]', tok.lower_))
ret.__annotations__ = get_calling_function().__annotations__
return ret
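A minimal sketch, not part of the commit, of what the two new regex filters keep and drop when applied to plain strings instead of spaCy tokens:

import re

words = ["drucker", "nr54065467", "455a33c5", "o.ä.", "verfügung"]
words = [w for w in words if not re.search(r'\d', w)]            # remove_words_containing_Numbers
words = [w for w in words
         if not re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?]', w)]  # remove_words_containing_specialCharacters
print(words)  # -> ['drucker', 'verfügung']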
def lemmatizeWord(word,filepath=LEMMAS):
"""http://www.lexiconista.com/datasets/lemmatization/"""
for line in list(textacy.fileio.read_file_lines(filepath=filepath)):
if word.lower() == line.split()[1].strip().lower():
return line.split()[0].strip().lower()
return word.lower() # falls nix gefunden wurde
def lemmatize() -> str:
ret = lambda tok: lemmatizeWord(tok.lower_)
ret.__annotations__ = get_calling_function().__annotations__
return ret
def normalizeSynonyms(default_return_first_Syn=False) -> str:
ret = lambda tok : getFirstSynonym(tok.lower_, default_return_first_Syn=default_return_first_Syn)
@ -377,6 +419,15 @@ def replacePhonenumbers(replace_with="PHONENUMBER") -> str:
return ret
def fixUnicode() -> str:
ret = lambda tok: textacy.preprocess.fix_bad_unicode(tok.lower_, normalization=u'NFC')
ret.__annotations__ = get_calling_function().__annotations__
return ret
def resolveAbbreviations():
pass #todo
@ -396,10 +447,44 @@ def lower() -> spacy.tokens.Doc:
return ret
################################################################################################################
path2xml = config.get("filepath","path2xml")
path2csv = config.get("filepath","path2csv")
path2csv = "M42-Export/Tickets_med.csv"
printlog("CSV: {0}".format(path2csv))
ticketcorpus = textacy.Corpus(DE_PARSER)
# idea: change ß to ss? in general?
"""
maybe split categories into subcategories
in general:
fix utf, split words at special characters
remove names
remove emails, urls, numbers
maybe even everything that contains any of those (or contains a .topleveldomain or special characters, or anything containing an @)
separate meaningful words from garbage: 8203;verfügung
remove abbreviations: m.a, o.ä.
nonsense examples: nr54065467 455a33c5 tvt?= ------problem-------- (see the regex sketch after this block)
"""
metaliste = [
"Subject",
@ -409,45 +494,55 @@ metaliste = [
clean_in_meta = {
"Solution":[removePOS(["SPACE"]),lower()],
"Subject":[removePOS(["SPACE","PUNCT"]),lower()]
"Subject":[removePOS(["SPACE","PUNCT"]),lower()],
"categoryName": [removePOS(["SPACE", "PUNCT"]), lower()]
}
printlog("Start Preprocessing")
clean_in_content=[
#removePOS(["SPACE","PUNCT","NUM"]),
keepPOS(["NOUN"]),
removePOS(["SPACE","PUNCT","NUM"]),
remove_words_containing_Numbers(),
remove_words_containing_specialCharacters(),
#replaceURLs(),
#replaceEmails(),
lemmatize(),
#removeWords(de_stop_words),
keepUniqeTokens(),
removePOS(["PUNCT"]),
#fixUnicode(),
#lemmatize(),
#removeWords(de_stop_words + config.get("preprocessing","custom_words").split(",")),
#removeENT("PERSON"),
#keepPOS(["NOUN"]),
#keepUniqeTokens(),
#keepENT(config.get("preprocessing","ents2keep"))
]
## add files to textacy-corpus,
print("add texts to textacy-corpus...")
printlog("add texts to textacy-corpus")
ticketcorpus.add_texts(
processTextstream(csv_to_contentStream(path2csv,"Description"), clean_in_content),
processDictstream(csv_to_metaStream(path2csv,metaliste),clean_in_meta)
)
printRandomDoc(ticketcorpus)
for i in range(10):
printRandomDoc(ticketcorpus)
end = time.time()
print("\n\nTime Elapsed Preprocessing:{0}\n\n".format(end - start))
printlog("Time Elapsed Preprocessing:{0} min".format((end - start)/60))
print("############################################ Topic Modeling #############################################")
############################################ Topic Modeling #############################################
print("\n\n")
start = time.time()
@ -482,7 +577,7 @@ def generate_labled_lines(textacyCorpus):
######################################## todo: move all of this into config (see the sketch below)
ngrams = (1)
ngrams = 1
min_df = 0
max_df = 1.0
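Sketch only, as the todo above suggests: the hard-coded modeling parameters could be read from config.ini instead. The [topicmodeling] section and key names here are assumptions, not existing config.ini entries:

import configparser as ConfigParser

config = ConfigParser.ConfigParser()
with open("config.ini") as f:
    config.read_file(f)

ngrams = config.getint("topicmodeling", "ngrams", fallback=1)
min_df = config.getint("topicmodeling", "min_df", fallback=0)
max_df = config.getfloat("topicmodeling", "max_df", fallback=1.0)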
@ -508,7 +603,7 @@ n_topics = len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen
print("\nvectorize corpus...")
printlog("vectorize corpus...")
vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in ticketcorpus)
@ -528,12 +623,12 @@ id2term = vectorizer.__getattribute__("id_to_term")
##################### LSA, LDA, NMF Topic Modeling via Textacy ##############################################
# Initialize and train a topic model
print("\nInitialize and train a topic model..")
printlog("Initialize and train a topic model..")
model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
model.fit(doc_term_matrix)
#Transform the corpus and interpret our model:
print("Transform the corpus and interpret our model..")
printlog("Transform the corpus and interpret our model..")
doc_topic_matrix = model.transform(doc_term_matrix)
print()
@ -552,7 +647,7 @@ for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_docu
print()
print()
"""
@ -560,19 +655,20 @@ print()
jgibbsLLDA_root = "java_LabledLDA/"
filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
jgibbsLLDA_root = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/java_LabledLDA/"
LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
#create file
textacy.fileio.write_file_lines(generate_labled_lines(ticketcorpus),filepath=filepath)
textacy.fileio.write_file_lines(generate_labled_lines(ticketcorpus), filepath=LLDA_filepath)
# wait for file to exist
while not os.path.exists(filepath):
while not os.path.exists(LLDA_filepath):
time.sleep(1)
print("\nstart LLDA:\n")
print("\n\n")
printlog("start LLDA:")
#run JGibsslda file
FNULL = open(os.devnull, 'w') # suppress output
subprocess.call(["java",
@ -586,7 +682,7 @@ subprocess.call(["java",
# NOTE: the files are hidden; they can be found in models/
#print twords
#twords
subprocess.call(["gzip",
"-dc",
"{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)])
@ -594,7 +690,7 @@ subprocess.call(["gzip",
print()
print()
"""
@ -602,10 +698,4 @@ print()
end = time.time()
print("\n\n\nTime Elapsed Topic Modeling:{0}\n\n".format(end - start))
sys.stdout = old_stdout
log_file.close()
printlog("\n\n\nTime Elapsed Topic Modeling:{0}\n\n".format(end - start))