last commit 11.9.17

This commit is contained in:
jannis.grundmann 2017-09-11 13:24:20 +02:00
parent 67e6f8845c
commit 0319e415a3
2 changed files with 40 additions and 15 deletions

View File

@ -7,14 +7,27 @@ language = de
[preprocessing] [preprocessing]
ents = WORK_OF_ART,ORG,PRODUCT,LOC ents2keep = WORK_OF_ART,ORG,PRODUCT,LOC
custom_words = grüßen,fragen custom_words = grüßen,fragen
#lemmatize = True #lemmatize = True
default_return_first_Syn = False
[topic modeling]
ngrams = (1,2)
min_df = 0
max_df = 1.0
no_below = 20
no_above = 0.5
topicModel = lda
top_topic_words = 5
top_document_labels_per_topic = 2

View File

@ -6,7 +6,7 @@ import re
import subprocess import subprocess
import time import time
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
import sys
import spacy import spacy
import textacy import textacy
from scipy import * from scipy import *
@ -113,9 +113,12 @@ def keepOnlyENT(ent_list,parser=PARSER):
def removeAllENT(ent_list, parser=PARSER): def removeAllENT(ent_list, parser=PARSER):
return lambda doc: parser(" ".join([tok.text for tok in doc if tok.ent_type_ not in ent_list])) return lambda doc: parser(" ".join([tok.text for tok in doc if tok.ent_type_ not in ent_list]))
def keepUniqueTokens(parser=PARSER):
    """Return a callable that re-parses a doc keeping only its distinct token texts.

    NOTE(review): iterating a set gives an arbitrary token order — presumably
    acceptable for the bag-of-words pipeline here; confirm order is irrelevant.
    """
    def _dedupe(doc):
        unique_texts = {tok.text for tok in doc}
        return parser(" ".join(unique_texts))
    return _dedupe
def lemmatize(parser=PARSER):
    """Return a callable that replaces every token with its lemma and re-parses the result."""
    def _lemmatized(doc):
        lemmas = [tok.lemma_ for tok in doc]
        return parser(" ".join(lemmas))
    return _lemmatized
def doc2Set(doc):
    """Return the string representation of the set of distinct token texts in *doc*.

    Replaces the former ``doc2Set = lambda ...`` assignment (PEP 8 E731:
    don't bind a lambda to a name); the callable's name and behavior are
    unchanged for all callers.
    """
    # The set drops duplicate token texts; str() of a set prints in arbitrary order.
    return str({tok.text for tok in doc})
doc2String = lambda doc : doc.text doc2String = lambda doc : doc.text
@ -215,21 +218,24 @@ def generate_labled_lines(textacyCorpus):
ents = config.get("preprocessing","ents").split(",") ents = config.get("preprocessing","ents2keep").split(",")
clean_in_content=compose(
clean_in_content=compose( #anmrk.: unterste-funktion wird zuerst ausgeführt
doc2String, doc2String,
#normalizeSynonyms(default_return_first_Syn=config.get("preprocessing","default_return_first_Syn")), keepUniqueTokens(),
#normalizeSynonyms(default_return_first_Syn=False),
lemmatize(),
replaceEmails(), replaceEmails(),
replaceURLs(), replaceURLs(),
replaceTwitterMentions(), replaceTwitterMentions(),
removeWords(stop_words),
#removeAllPOS(["SPACE","PUNCT"]),
#removeAllENT(ents), #removeAllENT(ents),
keepOnlyPOS(['NOUN']) keepOnlyPOS(['NOUN'])
) )
clean_in_meta = { clean_in_meta = {
"Loesung":removeAllPOS(["SPACE"]), "Loesung":removeAllPOS(["SPACE"]),
"Zusammenfassung":removeAllPOS(["SPACE","PUNCT"]) "Zusammenfassung":removeAllPOS(["SPACE","PUNCT"])
@ -238,9 +244,6 @@ clean_in_meta = {
## add files to textacy-corpus, ## add files to textacy-corpus,
print("add texts to textacy-corpus...") print("add texts to textacy-corpus...")
corpus.add_texts( corpus.add_texts(
@ -250,9 +253,18 @@ corpus.add_texts(
printRandomDoc(corpus) printRandomDoc(corpus)
#idee 3 versch. Corpi
####################'####################' Variablen todo alles in config
####################'####################'
####################'####################' todo alles in config
ngrams = (1,2) ngrams = (1,2)
@ -268,7 +280,7 @@ weighting = ('tf' if topicModel == 'lda' else 'tfidf')
top_topic_words = 5 top_topic_words = 5
top_document_labels_per_topic = 2 top_document_labels_per_topic = 2
n_topics = 4 n_topics = len(set(corpus[0].metadata.keys()))+1 #+1 wegen einem default-topic