fix: [bug] punctuation was not part of OOV and was not accounted for

This commit is contained in:
Alexandre Dulaunoy 2020-10-09 07:25:26 +02:00
parent ef5011a64f
commit 193ad08144
Signed by: adulau
GPG key ID: 09E2CD4944E6CBCD

View file

@ -73,6 +73,11 @@ for token in doc:
redisdb.zincrby("noun", 1, token.text) redisdb.zincrby("noun", 1, token.text)
redisdb.hincrby("stats", "noun", 1) redisdb.hincrby("stats", "noun", 1)
continue continue
if token.pos_ == "PUNCT" and not token.is_oov:
redisdb.zincrby("punct", 1, value)
redisdb.hincrby("stats", "punct", 1)
continue
if token.is_oov: if token.is_oov:
value = "{}".format(token) value = "{}".format(token)
@ -99,11 +104,6 @@ for token in doc:
redisdb.zincrby("email", 1, value) redisdb.zincrby("email", 1, value)
redisdb.hincrby("stats", "email", 1) redisdb.hincrby("stats", "email", 1)
continue continue
if token.is_punct:
redisdb.zincrby("punct", 1, value)
redisdb.hincrby("stats", "punct", 1)
continue
redisdb.zincrby("oov", 1, value) redisdb.zincrby("oov", 1, value)
redisdb.hincrby("stats", "oov", 1) redisdb.hincrby("stats", "oov", 1)