From 0319e415a37a9a4328d421044cec63887bf12c24 Mon Sep 17 00:00:00 2001
From: "jannis.grundmann"
Date: Mon, 11 Sep 2017 13:24:20 +0200
Subject: [PATCH] last commit 11.9.17

---
 config.ini       | 19 ++++++++++++++++---
 preprocessing.py | 36 ++++++++++++++++++++++++------------
 2 files changed, 40 insertions(+), 15 deletions(-)

diff --git a/config.ini b/config.ini
index 5612339..a0fd830 100644
--- a/config.ini
+++ b/config.ini
@@ -7,14 +7,27 @@
 language = de
 
 [preprocessing]
 
-ents = WORK_OF_ART,ORG,PRODUCT,LOC
+ents2keep = WORK_OF_ART,ORG,PRODUCT,LOC
 
 custom_words = grüßen,fragen
 
 #lemmatize = True
 
-default_return_first_Syn = False
-
+
+[topic modeling]
+
+ngrams = (1,2)
+
+min_df = 0
+max_df = 1.0
+no_below = 20
+no_above = 0.5
+
+topicModel = lda
+
+top_topic_words = 5
+
+top_document_labels_per_topic = 2
 
 
diff --git a/preprocessing.py b/preprocessing.py
index 61c1709..7dda81c 100644
--- a/preprocessing.py
+++ b/preprocessing.py
@@ -6,7 +6,7 @@ import re
 import subprocess
 import time
 import xml.etree.ElementTree as ET
-
+import sys
 import spacy
 import textacy
 from scipy import *
@@ -113,9 +113,12 @@ def keepOnlyENT(ent_list,parser=PARSER):
 def removeAllENT(ent_list, parser=PARSER):
     return lambda doc: parser(" ".join([tok.text for tok in doc if tok.ent_type_ not in ent_list]))
 
+def keepUniqueTokens(parser=PARSER):
+    return lambda doc: parser(" ".join(set([tok.text for tok in doc])))
 
+def lemmatize(parser=PARSER):
+    return lambda doc: parser(" ".join([tok.lemma_ for tok in doc]))
 
-doc2Set = lambda doc: str(set([tok.text for tok in doc]))
 doc2String = lambda doc : doc.text
 
 
@@ -215,21 +218,24 @@ def generate_labled_lines(textacyCorpus):
 
 
 
 
-ents = config.get("preprocessing","ents").split(",")
+ents = config.get("preprocessing","ents2keep").split(",")
 
-clean_in_content=compose(
+
+clean_in_content=compose( #note: the bottom-most function is executed first
     doc2String,
-    #normalizeSynonyms(default_return_first_Syn=config.get("preprocessing","default_return_first_Syn")),
+    keepUniqueTokens(),
+    #normalizeSynonyms(default_return_first_Syn=False),
+    lemmatize(),
     replaceEmails(),
     replaceURLs(),
     replaceTwitterMentions(),
-    removeWords(stop_words),
-    #removeAllPOS(["SPACE","PUNCT"]),
     #removeAllENT(ents),
     keepOnlyPOS(['NOUN'])
 )
+
+
 clean_in_meta = {
     "Loesung":removeAllPOS(["SPACE"]),
     "Zusammenfassung":removeAllPOS(["SPACE","PUNCT"])
@@ -238,9 +244,6 @@ clean_in_meta = {
 
 
 
-
-
-
 ## add files to textacy-corpus,
 print("add texts to textacy-corpus...")
 corpus.add_texts(
@@ -250,9 +253,18 @@ corpus.add_texts(
 
 
 printRandomDoc(corpus)
 
+#idea: 3 different corpora
 
 
-####################'####################' variables todo: everything in config
+
+####################'####################'
+
+
+
+
+
+
+####################'####################' todo: everything in config
 
 ngrams = (1,2)
@@ -268,7 +280,7 @@ weighting = ('tf' if topicModel == 'lda' else 'tfidf')
 
 
 top_topic_words = 5
 top_document_labels_per_topic = 2
-n_topics = 4
+n_topics = len(set(corpus[0].metadata.keys()))+1 #+1 because of one default topic
 
 
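
A note on the new cleaning pipeline: as the comment added in this patch says, compose applies its arguments bottom-up, i.e. keepOnlyPOS(['NOUN']) runs first and doc2String runs last, so tokens are lemmatized before keepUniqueTokens deduplicates them. Below is a minimal sketch of that right-to-left semantics, assuming the compose used in preprocessing.py behaves like ordinary function composition (its import is not shown in this patch); the stand-in steps are illustrative only.

    from functools import reduce

    def compose(*funcs):
        # compose(f, g, h)(x) == f(g(h(x))): the last argument is applied first.
        return reduce(lambda f, g: lambda x: f(g(x)), funcs)

    # Stand-in steps, only to illustrate the ordering.
    lower = lambda s: s.lower()
    strip_commas = lambda s: s.replace(",", "")

    pipeline = compose(strip_commas, lower)  # lower runs first, strip_commas second
    print(pipeline("Hello, World"))          # -> "hello world"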
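
One design consequence worth flagging: the new keepUniqueTokens builds a set, which deduplicates tokens but discards their order (Python sets have no defined iteration order), so the text handed to doc2String is scrambled. If downstream steps care about order, an order-preserving variant is possible; the following is a hedged alternative sketch assuming the module-level PARSER from preprocessing.py, not what the patch implements.

    def keepUniqueTokensOrdered(parser=PARSER):
        # Keep only the first occurrence of each token, preserving document order.
        def transform(doc):
            seen = set()
            kept = []
            for tok in doc:
                if tok.text not in seen:
                    seen.add(tok.text)
                    kept.append(tok.text)
            return parser(" ".join(kept))
        return transform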
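
Since everything in the new [topic modeling] section is stored as a string, the consuming code has to convert types itself; in particular ngrams = (1,2) is only the string "(1,2)" until parsed. A small sketch of how the section could be read safely follows; the use of ast.literal_eval is an assumption for illustration, not something this patch adds.

    import ast
    import configparser

    config = configparser.ConfigParser()
    config.read("config.ini")

    # configparser returns raw strings; convert each value explicitly.
    ngrams = ast.literal_eval(config.get("topic modeling", "ngrams"))  # -> (1, 2)
    min_df = config.getint("topic modeling", "min_df")                 # -> 0
    max_df = config.getfloat("topic modeling", "max_df")               # -> 1.0
    topicModel = config.get("topic modeling", "topicModel")            # -> "lda"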