last commit 11.9.17

This commit is contained in:
jannis.grundmann 2017-09-11 13:24:20 +02:00
parent 67e6f8845c
commit 0319e415a3
2 changed files with 40 additions and 15 deletions

View File

@ -7,14 +7,27 @@ language = de
[preprocessing]
ents = WORK_OF_ART,ORG,PRODUCT,LOC
ents2keep = WORK_OF_ART,ORG,PRODUCT,LOC
custom_words = grüßen,fragen
#lemmatize = True
default_return_first_Syn = False
[topic modeling]
ngrams = (1,2)
min_df = 0
max_df = 1.0
no_below = 20
no_above = 0.5
topicModel = lda
top_topic_words = 5
top_document_labels_per_topic = 2

View File

@ -6,7 +6,7 @@ import re
import subprocess
import time
import xml.etree.ElementTree as ET
import sys
import spacy
import textacy
from scipy import *
@ -113,9 +113,12 @@ def keepOnlyENT(ent_list,parser=PARSER):
def removeAllENT(ent_list, parser=PARSER):
    """Return a pipeline step that drops every token whose spaCy entity
    label (``tok.ent_type_``) appears in *ent_list*, then re-parses the
    remaining token texts into a fresh doc."""
    def _strip_entities(doc):
        survivors = (tok.text for tok in doc if tok.ent_type_ not in ent_list)
        return parser(" ".join(survivors))
    return _strip_entities
def keepUniqueTokens(parser=PARSER):
    """Return a pipeline step that removes duplicate tokens from a doc.

    Fix: the original joined ``set(...)``, whose iteration order is
    arbitrary, so repeated runs could produce differently-ordered (and thus
    differently-parsed) documents. ``dict.fromkeys`` keeps the first
    occurrence of each token text in document order, making the step
    deterministic while still de-duplicating.
    """
    return lambda doc: parser(" ".join(dict.fromkeys(tok.text for tok in doc)))
def lemmatize(parser=PARSER):
    """Return a pipeline step that replaces each token with its spaCy lemma
    (``tok.lemma_``) and re-parses the joined result."""
    def _to_lemmas(doc):
        lemmas = " ".join(tok.lemma_ for tok in doc)
        return parser(lemmas)
    return _to_lemmas
def doc2Set(doc):
    """Return the ``str()`` of the set of unique token texts in *doc*."""
    return str({tok.text for tok in doc})

def doc2String(doc):
    """Return the raw text of *doc*."""
    return doc.text
@ -215,21 +218,24 @@ def generate_labled_lines(textacyCorpus):
# Entity labels to keep during preprocessing, from the config's
# [preprocessing] section. Fix: the original read "ents" and then
# immediately overwrote the result with "ents2keep" — the first read was
# dead code (the commit renamed the key; both still exist in the config).
ents = config.get("preprocessing","ents2keep").split(",")
# Content-cleaning pipeline built with compose(): the bottom-most function
# is applied to the doc first, doc2String last.
# NOTE(review): the two identical `clean_in_content=compose(` lines below are
# the old/new pair of this diff hunk; only the second exists in the new file.
clean_in_content=compose(
clean_in_content=compose( # note: the bottom-most function is executed first
doc2String,
#normalizeSynonyms(default_return_first_Syn=config.get("preprocessing","default_return_first_Syn")),
keepUniqueTokens(),
#normalizeSynonyms(default_return_first_Syn=False),
lemmatize(),
replaceEmails(),
replaceURLs(),
replaceTwitterMentions(),
removeWords(stop_words),
#removeAllPOS(["SPACE","PUNCT"]),
#removeAllENT(ents),
keepOnlyPOS(['NOUN'])  # runs first: keep only nouns before the steps above
)
clean_in_meta = {
"Loesung":removeAllPOS(["SPACE"]),
"Zusammenfassung":removeAllPOS(["SPACE","PUNCT"])
@ -238,9 +244,6 @@ clean_in_meta = {
## add files to textacy-corpus,
print("add texts to textacy-corpus...")
corpus.add_texts(
@ -250,9 +253,18 @@ corpus.add_texts(
# Sanity check: print one randomly chosen document from the corpus.
printRandomDoc(corpus)
#idea: 3 different corpora
####################'####################' variables — TODO: move all of this into the config
####################'####################'
####################'####################' TODO: move all of this into the config
ngrams = (1,2)  # n-gram range; duplicates the [topic modeling] config value
@ -268,7 +280,7 @@ weighting = ('tf' if topicModel == 'lda' else 'tfidf')
# Hard-coded mirrors of the [topic modeling] config values — TODO confirm
# these should be read via config.get instead of being duplicated here.
top_topic_words = 5
top_document_labels_per_topic = 2
# NOTE(review): these two lines are the old/new pair of this diff hunk;
# the second assignment is the one kept by the commit.
n_topics = 4
# One topic per metadata field plus one default topic. Fix: len() of a dict
# already counts its (unique) keys, so set(...keys()) was redundant.
n_topics = len(corpus[0].metadata) + 1  # +1 because of a default topic