last commit 11.9.17
parent 67e6f8845c · commit 0319e415a3
config.ini | 19 lines changed
@@ -7,14 +7,27 @@ language = de

[preprocessing]

-ents = WORK_OF_ART,ORG,PRODUCT,LOC
+ents2keep = WORK_OF_ART,ORG,PRODUCT,LOC

custom_words = grüßen,fragen

#lemmatize = True

default_return_first_Syn = False

[topic modeling]

ngrams = (1,2)

min_df = 0
max_df = 1.0
no_below = 20
no_above = 0.5

topicModel = lda

top_topic_words = 5

top_document_labels_per_topic = 2
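The commit renames ents to ents2keep and fills out the [topic modeling] block. A minimal sketch of reading these keys with the stdlib configparser; the ast.literal_eval call for the ngrams tuple and the getboolean call are assumptions (a plain config.get returns a string, so "False" would be truthy):

import ast
import configparser

config = configparser.ConfigParser()
config.read("config.ini")

# comma-separated entity labels to keep
ents2keep = config.get("preprocessing", "ents2keep").split(",")

# values are strings: "(1,2)" has to be parsed into a real tuple,
# and booleans need getboolean ("False" as a string is truthy)
ngrams = ast.literal_eval(config.get("topic modeling", "ngrams"))
default_return_first_Syn = config.getboolean("preprocessing", "default_return_first_Syn")

min_df = config.getint("topic modeling", "min_df")
max_df = config.getfloat("topic modeling", "max_df")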
@@ -6,7 +6,7 @@ import re

import subprocess
import time
import xml.etree.ElementTree as ET

import sys
import spacy
import textacy
from scipy import *
@@ -113,9 +113,12 @@ def keepOnlyENT(ent_list,parser=PARSER):

def removeAllENT(ent_list, parser=PARSER):
    return lambda doc: parser(" ".join([tok.text for tok in doc if tok.ent_type_ not in ent_list]))

def keepUniqueTokens(parser=PARSER):
    return lambda doc: parser(" ".join(set([tok.text for tok in doc])))

def lemmatize(parser=PARSER):
    return lambda doc: parser(" ".join([tok.lemma_ for tok in doc]))

doc2Set = lambda doc: str(set([tok.text for tok in doc]))
doc2String = lambda doc: doc.text
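Each of these factories returns a closure that filters a spaCy doc and re-parses the joined text. A minimal usage sketch, assuming PARSER is a loaded 2017-era German spaCy model (the "de" model shortcut is an assumption):

import spacy

PARSER = spacy.load("de")  # 2017-era spaCy model shortcut; an assumption

def keepUniqueTokens(parser=PARSER):
    # drop duplicate token texts; note that set() loses token order
    return lambda doc: parser(" ".join(set([tok.text for tok in doc])))

def lemmatize(parser=PARSER):
    # replace each token by its lemma, then re-parse the result
    return lambda doc: parser(" ".join([tok.lemma_ for tok in doc]))

doc = PARSER("wir grüßen und grüßen euch")
print(lemmatize()(doc))         # lemmas instead of surface forms
print(keepUniqueTokens()(doc))  # each distinct token text once, order not preserved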
@@ -215,21 +218,24 @@ def generate_labled_lines(textacyCorpus):

-ents = config.get("preprocessing","ents").split(",")
+ents = config.get("preprocessing","ents2keep").split(",")

-clean_in_content=compose(
+clean_in_content=compose(  # note: the bottom-most function is applied first

    doc2String,
    #normalizeSynonyms(default_return_first_Syn=config.get("preprocessing","default_return_first_Syn")),
    keepUniqueTokens(),
    #normalizeSynonyms(default_return_first_Syn=False),
    lemmatize(),
    replaceEmails(),
    replaceURLs(),
    replaceTwitterMentions(),
    removeWords(stop_words),
    #removeAllPOS(["SPACE","PUNCT"]),
    #removeAllENT(ents),
    keepOnlyPOS(['NOUN'])
)

clean_in_meta = {
    "Loesung": removeAllPOS(["SPACE"]),
    "Zusammenfassung": removeAllPOS(["SPACE","PUNCT"])
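The translated comment says the bottom-most function runs first, which matches right-to-left composition: compose(f, g, h)(x) == f(g(h(x))). A minimal sketch of such a compose helper; it is illustrative only, the original presumably imports an equivalent (e.g. cytoolz.compose):

from functools import reduce

def compose(*funcs):
    # apply the last-listed function first, the first-listed last
    return lambda x: reduce(lambda acc, f: f(acc), reversed(funcs), x)

# order matters: strip punctuation first, then upper-case
pipeline = compose(str.upper, lambda s: s.replace("!", ""))
print(pipeline("hallo!"))  # -> "HALLO"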
@@ -238,9 +244,6 @@ clean_in_meta = {

## add files to textacy-corpus
print("add texts to textacy-corpus...")
corpus.add_texts(
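The corpus is populated via Corpus.add_texts with a parallel metadata stream. A hedged sketch of that call against the 2017-era textacy API; the sample texts and metadata keys are stand-ins, and the exact Corpus constructor signature is an assumption:

import textacy

corpus = textacy.Corpus("de")
texts = ["erster Beispieltext", "zweiter Beispieltext"]
metadatas = [{"Loesung": "..."}, {"Zusammenfassung": "..."}]
corpus.add_texts(texts, metadatas=metadatas)
print(len(corpus))  # number of parsed docs in the corpus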
@@ -250,9 +253,18 @@ corpus.add_texts(

printRandomDoc(corpus)

# idea: three different corpora

####################'####################' variables, todo: move everything into config
####################'####################'

####################'####################' todo: move everything into config

ngrams = (1,2)
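The paired thresholds in the config mirror two common stacks: min_df/max_df are sklearn CountVectorizer parameters, while no_below/no_above belong to gensim's Dictionary.filter_extremes. A minimal sketch under that assumption (toy thresholds; the config's no_below = 20 would empty this two-document corpus):

from gensim.corpora import Dictionary
from sklearn.feature_extraction.text import CountVectorizer

texts = [["hallo", "welt"], ["hallo", "themen", "modell"]]

# sklearn stack: document-frequency bounds plus the (1,2) ngram range
vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=1, max_df=1.0)
dtm = vectorizer.fit_transform(" ".join(t) for t in texts)

# gensim stack: drop tokens in fewer than no_below documents or in
# more than no_above (a fraction) of all documents
dictionary = Dictionary(texts)
dictionary.filter_extremes(no_below=1, no_above=0.5)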
@@ -268,7 +280,7 @@ weighting = ('tf' if topicModel == 'lda' else 'tfidf')

top_topic_words = 5
top_document_labels_per_topic = 2

-n_topics = 4
+n_topics = len(set(corpus[0].metadata.keys()))+1  # +1 for a default topic
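The hard-coded n_topics = 4 is replaced by a value derived from the first document's metadata keys plus one default topic, alongside the tf-vs-tfidf weighting switch. A minimal sketch of both computations; the metadata dict here is a stand-in for corpus[0].metadata:

# stand-in for corpus[0].metadata from the textacy corpus
metadata = {"Loesung": "...", "Zusammenfassung": "...", "Kategorie": "..."}

topicModel = "lda"
# LDA works on raw term frequencies, everything else on tf-idf
weighting = "tf" if topicModel == "lda" else "tfidf"

# one topic per metadata field, plus one default/catch-all topic
n_topics = len(set(metadata.keys())) + 1
print(weighting, n_topics)  # -> tf 4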