Added regex for further cleaning
This commit is contained in:
parent 13ec7cdef4
commit 092052dfe1

config.ini: 20 lines changed
@@ -1,7 +1,19 @@
[default]
[filepath]
thesauruspath = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/openthesaurus.csv
path2xml = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/ticketSamples.xml
path2csv = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_2017-09-13.csv
small = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv
logfile = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log
lemmas = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmatization-de.txt
thesauruspath = openthesaurus.csv
path2xml = ticketSamples.xml
language = de

@@ -9,7 +21,7 @@ language = de
ents2keep = WORK_OF_ART,ORG,PRODUCT,LOC
custom_words = grüßen,fragen
custom_words = grüßen,fragen,damen,probleme,herren,dank
#lemmatize = True
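The new [filepath] section and the extended custom_words value are read via config.get(...) in test.py further down. A minimal sketch of how these keys resolve, assuming the section and option names used there:

import configparser

config = configparser.ConfigParser()
config.read("config.ini")

logfile = config.get("filepath", "logfile")    # absolute log path from the new [filepath] section
lemmas = config.get("filepath", "lemmas")      # path to lemmatization-de.txt
# custom_words is stored as one comma-separated string and split in code
custom_words = config.get("preprocessing", "custom_words").split(",")
print(custom_words)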
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
@@ -22,7 +22,7 @@ config = ConfigParser.ConfigParser()
with open("config.ini") as f:
    config.read_file(f)

#todo print&log

path2xml = config.get("default","path2xml")

@@ -281,7 +281,7 @@ topicModel = 'lda'
weighting = ('tf' if topicModel == 'lda' else 'tfidf')

top_topic_words = 5
top_document_labels_per_topic = 2
top_document_labels_per_topic = 5

n_topics = len(set(corpus[0].metadata.keys()))+1 #+1 because of a default topic

@@ -311,7 +311,7 @@ id2term = vectorizer.__getattribute__("id_to_term")

"""
##################### LSA, LDA, NMF Topic Modeling via Textacy ##############################################

# Initialize and train a topic model

@@ -339,7 +339,7 @@ for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_docu
    print()
    print()

"""

test.py: 200 lines changed
@@ -1,7 +1,11 @@
# -*- coding: utf-8 -*-

import time
start = time.time()

import logging

import csv
import functools
@ -16,45 +20,55 @@ import textacy
|
|||
from scipy import *
|
||||
from textacy import Vectorizer
|
||||
import warnings
|
||||
csv.field_size_limit(sys.maxsize)
|
||||
|
||||
import configparser as ConfigParser
|
||||
import sys
|
||||
|
||||
|
||||
old_stdout = sys.stdout
|
||||
|
||||
log_file = open("printout.log","w")
|
||||
|
||||
sys.stdout = log_file
|
||||
csv.field_size_limit(sys.maxsize)
|
||||
|
||||
|
||||
|
||||
|
||||
# Load the configuration file
|
||||
import configparser as ConfigParser
|
||||
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"
|
||||
|
||||
config = ConfigParser.ConfigParser()
|
||||
with open("config.ini") as f:
|
||||
with open(config_ini) as f:
|
||||
config.read_file(f)
|
||||
|
||||
|
||||
|
||||
|
||||
# config logging
|
||||
logging.basicConfig(filename=config.get("filepath","logfile"), level=logging.INFO)
|
||||
|
||||
path2csv = "M42-Export/Tickets_small.csv"
|
||||
#path2csv = "M42-Export/Tickets_2017-09-13.csv" # 21167
|
||||
|
||||
path2xml = config.get("default","path2xml")
|
||||
thesauruspath = config.get("default","thesauruspath")
|
||||
|
||||
DE_PARSER = spacy.load("de")
|
||||
de_stop_words=list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)
|
||||
|
||||
thesauruspath = config.get("filepath","thesauruspath")
|
||||
THESAURUS = list(textacy.fileio.read_csv(thesauruspath, delimiter=";"))
|
||||
|
||||
|
||||
DE_PARSER = spacy.load("de") #todo spacherkennung idee: verschiedene Corpi für verschiedene Sprachen
|
||||
de_stop_words=list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)
|
||||
|
||||
|
||||
LEMMAS=config.get("filepath","lemmas")
|
||||
|
||||
|
||||
|
||||
############# misc
|
||||
|
||||
def printlog(string, level="INFO"):
|
||||
"""log and prints"""
|
||||
print(string)
|
||||
if level=="INFO":
|
||||
logging.info(string)
|
||||
elif level=="DEBUG":
|
||||
logging.debug(string)
|
||||
elif level == "WARNING":
|
||||
logging.warning(string)
|
||||
printlog("Load functions")
|
||||
|
||||
def compose(*functions):
|
||||
def compose2(f, g):
|
||||
return lambda x: f(g(x))
|
||||
|
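The hunk above both redirects sys.stdout into printout.log and configures the logfile from config.ini, with printlog() bridging the two. A hedged sketch of an alternative that keeps stdout untouched by attaching a file handler and a stream handler to one logger; the logger name and the hard-coded logfile path are assumptions for the sketch, not part of the commit:

import logging

logger = logging.getLogger("ticket_preprocessing")
logger.setLevel(logging.INFO)
logger.addHandler(logging.FileHandler("topicModelTickets.log"))  # same file the config points to
logger.addHandler(logging.StreamHandler())                       # still visible on the console

logger.info("Load functions")   # would replace printlog("Load functions")

Keeping stdout untouched avoids having to restore old_stdout and close the extra log file at the end of the script.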
@@ -84,17 +98,17 @@ def get_calling_function():
        return func
    raise AttributeError("func not found")

def printRandomDoc(textacyCorpus):
    import random
    print()

    print("len(textacyCorpus) = %i" % len(textacyCorpus))
    printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
    randIndex = int((len(textacyCorpus) - 1) * random.random())
    print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
    printlog("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))

    print()

############# load xml
def generateMainTextfromTicketXML(path2xml, main_textfield='Description'):
    """
@@ -111,6 +125,7 @@ def generateMainTextfromTicketXML(path2xml, main_textfield='Description'):
    for field in ticket:
        if field.tag == main_textfield:
            yield field.text

def generateMetadatafromTicketXML(path2xml, leave_out=['Description']):
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

@@ -144,6 +159,7 @@ def csv_to_contentStream(path2csv: str, content_collumn_name: str):
            content_collumn = j
        else:
            yield lst[content_collumn]

def csv_to_metaStream(path2csv: str, metalist: [str]):
    """
    :param path2csv: string
@@ -171,9 +187,8 @@ def csv_to_metaStream(path2csv: str, metalist: [str]):

############################################ Preprocessing ##############################################

print("############################################ Preprocessing ##############################################")
print()

############# on str-gen
@@ -211,7 +226,11 @@ def processTextstream(textstream, funclist, parser=DE_PARSER):
    pipe = parser.pipe(textstream)

    for doc in pipe:
        tokens = [tok for tok in doc]

        tokens = []
        for tok in doc:
            tokens.append(tok)

        tokens = processTokens(tokens,funclist,parser)
        yield " ".join([tok.lower_ for tok in tokens])
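processTokens() itself lies outside the shown hunks, so the following is only a hypothetical sketch of how the bool-annotated filter factories (removePOS, removeWords, the new remove_words_containing_* functions) could be applied to the token list; the function name and dispatch logic are assumptions:

def filter_tokens(tokens, funclist):
    # each funclist entry is a lambda carrying the factory's return annotation,
    # so bool-annotated entries can be used directly as token predicates
    for func in funclist:
        if func.__annotations__.get("return") is bool:
            tokens = [tok for tok in tokens if func(tok)]
    return tokens

Copying __annotations__ onto the returned lambdas is what makes this kind of dispatch possible at all, which seems to be the reason every factory in the file does it.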
@@ -264,7 +283,6 @@ def removePOS(pos_list)-> bool:
    return ret

def removeWords(words, keep=None)-> bool:
    #todo in: str or str-list
    if hasattr(keep, '__iter__'):
        for k in keep:
            try:
@@ -289,13 +307,37 @@ def removeENT(ent_list) -> bool:
    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def lemmatize() -> str:
    ret = lambda tok: tok.lemma_
def remove_words_containing_Numbers() -> bool:
    ret = lambda tok: not bool(re.search('\d', tok.lower_))

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def remove_words_containing_specialCharacters() -> bool:
    ret = lambda tok: not bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?]', tok.lower_))

    ret.__annotations__ = get_calling_function().__annotations__
    return ret

def lemmatizeWord(word,filepath=LEMMAS):
    """http://www.lexiconista.com/datasets/lemmatization/"""
    for line in list(textacy.fileio.read_file_lines(filepath=filepath)):
        if word.lower() == line.split()[1].strip().lower():
            return line.split()[0].strip().lower()
    return word.lower()  # if nothing was found

def lemmatize() -> str:
    ret = lambda tok: lemmatizeWord(tok.lower_)

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def normalizeSynonyms(default_return_first_Syn=False) -> str:
    ret = lambda tok : getFirstSynonym(tok.lower_, default_return_first_Syn=default_return_first_Syn)
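The two new predicates above are the regex cleaning this commit adds. A quick standalone check of what they accept and reject, using plain strings in place of spaCy tokens; the sample words are taken from the planning notes further down in this diff:

import re

def contains_number(s):
    return bool(re.search(r'\d', s))

def contains_special(s):
    return bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?]', s))

for word in ["nr54065467", "455a33c5", "tvt?=", "verfügung"]:
    print(word, "has number:", contains_number(word), "has special char:", contains_special(word))

The ticket-noise tokens trip at least one of the two checks, while an ordinary word like "verfügung" passes both.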
@@ -377,6 +419,15 @@ def replacePhonenumbers(replace_with="PHONENUMBER") -> str:
    return ret


def fixUnicode() -> str:
    ret = lambda tok: textacy.preprocess.fix_bad_unicode(tok.lower_, normalization=u'NFC')

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def resolveAbbreviations():
    pass  #todo
@@ -396,10 +447,44 @@ def lower() -> spacy.tokens.Doc:
    return ret

################################################################################################################

path2xml = config.get("filepath","path2xml")

path2csv = config.get("filepath","path2csv")
path2csv = "M42-Export/Tickets_med.csv"
printlog("CSV: {0}".format(path2csv))

ticketcorpus = textacy.Corpus(DE_PARSER)

#idea: change ß to ss? as a general rule?

"""
maybe split categories into subcategories

in general:
fix utf, split words at special characters
remove names

remove emails, urls, numbers
maybe even drop everything that contains one of those (or contains a .toplvldomain or special characters, or anything that contains an @)

separate meaningful words from garbage: 8203;verfügung

remove abbreviations: m.a, o.ä.


meaningless examples: nr54065467 455a33c5 tvt?= ------problem--------
"""

metaliste = [
    "Subject",
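The planning notes above list further cleaning ideas: e-mails, URLs, numbers, anything containing an @. A rough sketch of how that could look at string level with plain re; the placeholder tags and the simplified patterns are assumptions, not part of the commit:

import re

def rough_clean(text):
    # deliberately simplified patterns; real URL/e-mail handling needs more care
    text = re.sub(r'\S+@\S+', ' EMAIL ', text)              # anything containing an @
    text = re.sub(r'https?://\S+|www\.\S+', ' URL ', text)  # obvious URLs
    text = re.sub(r'\b\d+\b', ' NUMBER ', text)             # bare numbers
    return text

print(rough_clean("mail to max@example.com, see www.example.com, ticket 54065467"))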
@@ -409,45 +494,55 @@ metaliste = [

clean_in_meta = {
    "Solution":[removePOS(["SPACE"]),lower()],
    "Subject":[removePOS(["SPACE","PUNCT"]),lower()]
    "Subject":[removePOS(["SPACE","PUNCT"]),lower()],
    "categoryName": [removePOS(["SPACE", "PUNCT"]), lower()]

}


printlog("Start Preprocessing")

clean_in_content=[
    #removePOS(["SPACE","PUNCT","NUM"]),
    keepPOS(["NOUN"]),

    removePOS(["SPACE","PUNCT","NUM"]),
    remove_words_containing_Numbers(),
    remove_words_containing_specialCharacters(),

    #replaceURLs(),
    #replaceEmails(),
    lemmatize(),
    #removeWords(de_stop_words),
    keepUniqeTokens(),
    removePOS(["PUNCT"]),
    #fixUnicode(),

    #lemmatize(),
    #removeWords(de_stop_words + config.get("preprocessing","custom_words").split(",")),

    #removeENT("PERSON"),
    #keepPOS(["NOUN"]),
    #keepUniqeTokens(),
    #keepENT(config.get("preprocessing","ents2keep"))

]


## add files to textacy-corpus,
print("add texts to textacy-corpus...")
printlog("add texts to textacy-corpus")
ticketcorpus.add_texts(
    processTextstream(csv_to_contentStream(path2csv,"Description"), clean_in_content),
    processDictstream(csv_to_metaStream(path2csv,metaliste),clean_in_meta)
)

for i in range(10):
    printRandomDoc(ticketcorpus)

end = time.time()
print("\n\nTime Elapsed Preprocessing:{0}\n\n".format(end - start))
printlog("Time Elapsed Preprocessing:{0} min".format((end - start)/60))


print("############################################ Topic Modeling #############################################")
############################################ Topic Modeling #############################################
print("\n\n")
start = time.time()
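One of the commented-out steps, removeWords(de_stop_words + config.get("preprocessing","custom_words").split(",")), ties back to the custom_words change in config.ini at the top of this commit. A minimal sketch of how that combined removal list resolves; the short stop-word list is only a stand-in for spaCy's German STOP_WORDS:

custom_words = "grüßen,fragen,damen,probleme,herren,dank".split(",")
de_stop_words = ["der", "die", "das", "und"]          # stand-in for the full spaCy list
words_to_remove = set(de_stop_words + custom_words)
print(sorted(words_to_remove))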
@@ -482,7 +577,7 @@ def generate_labled_lines(textacyCorpus):

####################'####################' todo: move everything into config

ngrams = (1)
ngrams = 1

min_df = 0
max_df = 1.0

@@ -508,7 +603,7 @@ n_topics = len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen


print("\nvectorize corpus...")
printlog("vectorize corpus...")
vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)

terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in ticketcorpus)
@@ -528,12 +623,12 @@ id2term = vectorizer.__getattribute__("id_to_term")
##################### LSA, LDA, NMF Topic Modeling via Textacy ##############################################

# Initialize and train a topic model
print("\nInitialize and train a topic model..")
printlog("Initialize and train a topic model..")
model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
model.fit(doc_term_matrix)

#Transform the corpus and interpret our model:
print("Transform the corpus and interpret our model..")
printlog("Transform the corpus and interpret our model..")
doc_topic_matrix = model.transform(doc_term_matrix)
print()

@@ -552,7 +647,7 @@ for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_docu
    print()
    print()

"""
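For readers without the ticket corpus at hand, a self-contained stand-in for the Vectorizer / TopicModel flow above, written with scikit-learn, which textacy's TopicModel wraps; the toy documents and topic count are invented:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

docs = ["drucker treiber fehler", "passwort zurücksetzen konto", "drucker papier stau"]
vec = CountVectorizer()                            # plain term counts, i.e. 'tf' weighting
dtm = vec.fit_transform(docs)

lda = LatentDirichletAllocation(n_components=2, random_state=0)
doc_topic = lda.fit_transform(dtm)                 # mirrors model.fit(...) / model.transform(...)
print(doc_topic.shape)                             # (3 documents, 2 topics)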
@@ -560,19 +655,20 @@ print()


jgibbsLLDA_root = "java_LabledLDA/"
filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
jgibbsLLDA_root = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/java_LabledLDA/"
LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)


#create file
textacy.fileio.write_file_lines(generate_labled_lines(ticketcorpus),filepath=filepath)
textacy.fileio.write_file_lines(generate_labled_lines(ticketcorpus), filepath=LLDA_filepath)


# wait for file to exist
while not os.path.exists(filepath):
while not os.path.exists(LLDA_filepath):
    time.sleep(1)

print("\nstart LLDA:\n")
print("\n\n")
printlog("start LLDA:")
#run JGibsslda file
FNULL = open(os.devnull, 'w')  # suppress output
subprocess.call(["java",

@@ -586,7 +682,7 @@ subprocess.call(["java",

# NOTE: the files are hidden. they can be found in models/

#print twords
#twords
subprocess.call(["gzip",
    "-dc",
    "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)])

@@ -594,7 +690,7 @@ subprocess.call(["gzip",
print()
print()

"""
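The hunk above writes one labeled line per ticket to LLDA_filepath, waits for the file to appear, and then shells out to the Java labeled-LDA tool. A self-contained sketch of that write / wait / call pattern; the file content, output path, and command are placeholders, since the real java argument list sits outside the shown hunks:

import os
import subprocess
import time

labled_lines = ["category1 token token token\n", "category2 token token\n"]  # stand-in for generate_labled_lines(ticketcorpus)
out_path = "/tmp/tickets_demo.txt"                                           # stand-in for LLDA_filepath

with open(out_path, "w") as f:
    f.writelines(labled_lines)

while not os.path.exists(out_path):   # same wait loop as in the hunk above
    time.sleep(1)

with open(os.devnull, "w") as FNULL:  # suppress the called tool's stdout
    subprocess.call(["echo", "placeholder for the java_LabledLDA command"], stdout=FNULL)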
@@ -602,10 +698,4 @@ print()


end = time.time()
print("\n\n\nTime Elapsed Topic Modeling:{0}\n\n".format(end - start))

sys.stdout = old_stdout

log_file.close()
printlog("\n\n\nTime Elapsed Topic Modeling:{0}\n\n".format(end - start))