mirror of
https://github.com/adulau/napkin-text-analysis.git
synced 2024-11-25 03:17:07 +00:00
chg: [feature] add punctuation statistics for out-of-vocabulary (OOV) tokens
(but the punctuation detection in spaCy seems very buggy or incorrect)
This commit is contained in:
parent
526f88071c
commit
3c3760019e
1 changed file with 8 additions and 1 deletion
|
@ -36,7 +36,9 @@ with open(args.f, 'r') as file:
|
||||||
|
|
||||||
doc = nlp(text)
|
doc = nlp(text)
|
||||||
|
|
||||||
analysis = ["verb:napkin", "noun:napkin", "hashtag:napkin", "mention:napkin", "digit:napkin", "url:napking", "oov:napkin", "labels:napkin"]
|
analysis = ["verb:napkin", "noun:napkin", "hashtag:napkin", "mention:napkin",
|
||||||
|
"digit:napkin", "url:napking", "oov:napkin", "labels:napkin",
|
||||||
|
"punct:napkin"]
|
||||||
|
|
||||||
redisdb.hset("stats", "token", doc.__len__())
|
redisdb.hset("stats", "token", doc.__len__())
|
||||||
|
|
||||||
|
@ -75,6 +77,11 @@ for token in doc:
|
||||||
redisdb.zincrby("email:napkin", 1, value)
|
redisdb.zincrby("email:napkin", 1, value)
|
||||||
redisdb.hincrby("stats", "email:napkin", 1)
|
redisdb.hincrby("stats", "email:napkin", 1)
|
||||||
continue
|
continue
|
||||||
|
if token.is_punct:
|
||||||
|
redisdb.zincrby("punct:napkin", 1, value)
|
||||||
|
redisdb.hincrby("stats", "punct:napkin", 1)
|
||||||
|
continue
|
||||||
|
|
||||||
redisdb.zincrby("oov:napkin", 1, value)
|
redisdb.zincrby("oov:napkin", 1, value)
|
||||||
redisdb.hincrby("stats", "oov:napkin", 1)
|
redisdb.hincrby("stats", "oov:napkin", 1)
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue