last commit 11.9.17
parent 67e6f8845c · commit 0319e415a3
config.ini | 19 lines changed
@@ -7,14 +7,27 @@ language = de

[preprocessing]

-ents = WORK_OF_ART,ORG,PRODUCT,LOC
+ents2keep = WORK_OF_ART,ORG,PRODUCT,LOC

custom_words = grüßen,fragen

#lemmatize = True

default_return_first_Syn = False

[topic modeling]

ngrams = (1,2)

min_df = 0
max_df = 1.0
no_below = 20
no_above = 0.5

topicModel = lda

top_topic_words = 5

top_document_labels_per_topic = 2
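The commit renames ents to ents2keep and fills out the [topic modeling] block. A minimal sketch of reading these keys with the stdlib configparser; the ast.literal_eval call for the ngrams tuple and the getboolean call are assumptions (a plain config.get returns a string, so "False" would be truthy):

import ast
import configparser

config = configparser.ConfigParser()
config.read("config.ini")

# comma-separated entity labels to keep
ents2keep = config.get("preprocessing", "ents2keep").split(",")

# values are strings: "(1,2)" has to be parsed into a real tuple,
# and booleans need getboolean ("False" as a string is truthy)
ngrams = ast.literal_eval(config.get("topic modeling", "ngrams"))
default_return_first_Syn = config.getboolean("preprocessing", "default_return_first_Syn")

min_df = config.getint("topic modeling", "min_df")
max_df = config.getfloat("topic modeling", "max_df")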
@@ -6,7 +6,7 @@ import re

import subprocess
import time
import xml.etree.ElementTree as ET

import sys
import spacy
import textacy
from scipy import *
@@ -113,9 +113,12 @@ def keepOnlyENT(ent_list,parser=PARSER):

def removeAllENT(ent_list, parser=PARSER):
    return lambda doc: parser(" ".join([tok.text for tok in doc if tok.ent_type_ not in ent_list]))

def keepUniqueTokens(parser=PARSER):
    return lambda doc: parser(" ".join(set([tok.text for tok in doc])))

def lemmatize(parser=PARSER):
    return lambda doc: parser(" ".join([tok.lemma_ for tok in doc]))

doc2Set = lambda doc: str(set([tok.text for tok in doc]))
doc2String = lambda doc: doc.text
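Each of these factories returns a closure that filters a spaCy doc and re-parses the joined text. A minimal usage sketch, assuming PARSER is a loaded 2017-era German spaCy model (the "de" model shortcut is an assumption):

import spacy

PARSER = spacy.load("de")  # 2017-era spaCy model shortcut; an assumption

def keepUniqueTokens(parser=PARSER):
    # drop duplicate token texts; note that set() loses token order
    return lambda doc: parser(" ".join(set([tok.text for tok in doc])))

def lemmatize(parser=PARSER):
    # replace each token by its lemma, then re-parse the result
    return lambda doc: parser(" ".join([tok.lemma_ for tok in doc]))

doc = PARSER("wir grüßen und grüßen euch")
print(lemmatize()(doc))         # lemmas instead of surface forms
print(keepUniqueTokens()(doc))  # each distinct token text once, order not preserved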
@@ -215,21 +218,24 @@ def generate_labled_lines(textacyCorpus):

-ents = config.get("preprocessing","ents").split(",")
+ents = config.get("preprocessing","ents2keep").split(",")

-clean_in_content=compose(
+clean_in_content=compose(  # note: the bottom-most function is applied first

    doc2String,
    #normalizeSynonyms(default_return_first_Syn=config.get("preprocessing","default_return_first_Syn")),
    keepUniqueTokens(),
    #normalizeSynonyms(default_return_first_Syn=False),
    lemmatize(),
    replaceEmails(),
    replaceURLs(),
    replaceTwitterMentions(),
    removeWords(stop_words),
    #removeAllPOS(["SPACE","PUNCT"]),
    #removeAllENT(ents),
    keepOnlyPOS(['NOUN'])
)

clean_in_meta = {
    "Loesung": removeAllPOS(["SPACE"]),
    "Zusammenfassung": removeAllPOS(["SPACE","PUNCT"])
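The translated comment says the bottom-most function runs first, which matches right-to-left composition: compose(f, g, h)(x) == f(g(h(x))). A minimal sketch of such a compose helper; it is illustrative only, the original presumably imports an equivalent (e.g. cytoolz.compose):

from functools import reduce

def compose(*funcs):
    # apply the last-listed function first, the first-listed last
    return lambda x: reduce(lambda acc, f: f(acc), reversed(funcs), x)

# order matters: strip punctuation first, then upper-case
pipeline = compose(str.upper, lambda s: s.replace("!", ""))
print(pipeline("hallo!"))  # -> "HALLO"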
@@ -238,9 +244,6 @@ clean_in_meta = {

## add files to textacy-corpus
print("add texts to textacy-corpus...")
corpus.add_texts(
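The corpus is populated via Corpus.add_texts with a parallel metadata stream. A hedged sketch of that call against the 2017-era textacy API; the sample texts and metadata keys are stand-ins, and the exact Corpus constructor signature is an assumption:

import textacy

corpus = textacy.Corpus("de")
texts = ["erster Beispieltext", "zweiter Beispieltext"]
metadatas = [{"Loesung": "..."}, {"Zusammenfassung": "..."}]
corpus.add_texts(texts, metadatas=metadatas)
print(len(corpus))  # number of parsed docs in the corpus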
@@ -250,9 +253,18 @@ corpus.add_texts(

printRandomDoc(corpus)

# idea: three different corpora

####################'####################' variables, todo: move everything into config
####################'####################'

####################'####################' todo: move everything into config

ngrams = (1,2)
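The paired thresholds in the config mirror two common stacks: min_df/max_df are sklearn CountVectorizer parameters, while no_below/no_above belong to gensim's Dictionary.filter_extremes. A minimal sketch under that assumption (toy thresholds; the config's no_below = 20 would empty this two-document corpus):

from gensim.corpora import Dictionary
from sklearn.feature_extraction.text import CountVectorizer

texts = [["hallo", "welt"], ["hallo", "themen", "modell"]]

# sklearn stack: document-frequency bounds plus the (1,2) ngram range
vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=1, max_df=1.0)
dtm = vectorizer.fit_transform(" ".join(t) for t in texts)

# gensim stack: drop tokens in fewer than no_below documents or in
# more than no_above (a fraction) of all documents
dictionary = Dictionary(texts)
dictionary.filter_extremes(no_below=1, no_above=0.5)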
@@ -268,7 +280,7 @@ weighting = ('tf' if topicModel == 'lda' else 'tfidf')

top_topic_words = 5
top_document_labels_per_topic = 2

-n_topics = 4
+n_topics = len(set(corpus[0].metadata.keys()))+1  # +1 for a default topic
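The hard-coded n_topics = 4 is replaced by a value derived from the first document's metadata keys plus one default topic, alongside the tf-vs-tfidf weighting switch. A minimal sketch of both computations; the metadata dict here is a stand-in for corpus[0].metadata:

# stand-in for corpus[0].metadata from the textacy corpus
metadata = {"Loesung": "...", "Zusammenfassung": "...", "Kategorie": "..."}

topicModel = "lda"
# LDA works on raw term frequencies, everything else on tf-idf
weighting = "tf" if topicModel == "lda" else "tfidf"

# one topic per metadata field, plus one default/catch-all topic
n_topics = len(set(metadata.keys())) + 1
print(weighting, n_topics)  # -> tf 4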