Deduplicate note tweet parsing

This commit is contained in:
Zed 2023-03-03 21:19:21 +01:00
parent 368974c803
commit aea884c48e
3 changed files with 35 additions and 65 deletions

View File

@ -204,12 +204,12 @@ proc parseTweet(js: JsonNode; jsCard: JsonNode = newJNull()): Tweet =
) )
) )
result.expandTweetEntities(js)
# fix for pinned threads # fix for pinned threads
if result.hasThread and result.threadId == 0: if result.hasThread and result.threadId == 0:
result.threadId = js{"self_thread", "id_str"}.getId result.threadId = js{"self_thread", "id_str"}.getId
result.expandTweetEntities(js)
if js{"is_quote_status"}.getBool: if js{"is_quote_status"}.getBool:
result.quote = some Tweet(id: js{"quoted_status_id_str"}.getId) result.quote = some Tweet(id: js{"quoted_status_id_str"}.getId)

View File

@ -230,47 +230,37 @@ proc expandUserEntities*(user: var User; js: JsonNode) =
user.bio = user.bio.replacef(unRegex, unReplace) user.bio = user.bio.replacef(unRegex, unReplace)
.replacef(htRegex, htReplace) .replacef(htRegex, htReplace)
proc expandTweetEntities*(tweet: Tweet; js: JsonNode) = proc expandTextEntities(tweet: Tweet; entities: JsonNode; text: string; textSlice: Slice[int];
let replyTo=""; hasQuote=false) =
orig = tweet.text.toRunes let hasCard = tweet.card.isSome
textRange = js{"display_text_range"}
textSlice = textRange{0}.getInt .. textRange{1}.getInt
hasQuote = js{"is_quote_status"}.getBool
hasCard = tweet.card.isSome
var replyTo = ""
if tweet.replyId != 0:
with reply, js{"in_reply_to_screen_name"}:
tweet.reply.add reply.getStr
replyTo = reply.getStr
let ent = ? js{"entities"}
var replacements = newSeq[ReplaceSlice]() var replacements = newSeq[ReplaceSlice]()
with urls, ent{"urls"}: with urls, entities{"urls"}:
for u in urls: for u in urls:
let urlStr = u["url"].getStr let urlStr = u["url"].getStr
if urlStr.len == 0 or urlStr notin tweet.text: if urlStr.len == 0 or urlStr notin text:
continue continue
replacements.extractUrls(u, textSlice.b, hideTwitter = hasQuote) replacements.extractUrls(u, textSlice.b, hideTwitter = hasQuote)
if hasCard and u{"url"}.getStr == get(tweet.card).url: if hasCard and u{"url"}.getStr == get(tweet.card).url:
get(tweet.card).url = u{"expanded_url"}.getStr get(tweet.card).url = u{"expanded_url"}.getStr
with media, ent{"media"}: with media, entities{"media"}:
for m in media: for m in media:
replacements.extractUrls(m, textSlice.b, hideTwitter = true) replacements.extractUrls(m, textSlice.b, hideTwitter = true)
if "hashtags" in ent: if "hashtags" in entities:
for hashtag in ent["hashtags"]: for hashtag in entities["hashtags"]:
replacements.extractHashtags(hashtag) replacements.extractHashtags(hashtag)
if "symbols" in ent: if "symbols" in entities:
for symbol in ent["symbols"]: for symbol in entities["symbols"]:
replacements.extractHashtags(symbol) replacements.extractHashtags(symbol)
if "user_mentions" in ent: if "user_mentions" in entities:
for mention in ent["user_mentions"]: for mention in entities["user_mentions"]:
let let
name = mention{"screen_name"}.getStr name = mention{"screen_name"}.getStr
slice = mention.extractSlice slice = mention.extractSlice
@ -287,47 +277,27 @@ proc expandTweetEntities*(tweet: Tweet; js: JsonNode) =
replacements.deduplicate replacements.deduplicate
replacements.sort(cmp) replacements.sort(cmp)
tweet.text = orig.replacedWith(replacements, textSlice) tweet.text = text.toRunes.replacedWith(replacements, textSlice).strip(leading=false)
.strip(leading=false)
proc expandNoteTweetEntities*(tweet: Tweet; noteTweet: JsonNode) = proc expandTweetEntities*(tweet: Tweet; js: JsonNode) =
let let
text = noteTweet{"text"}.getStr entities = ? js{"entities"}
orig = text.toRunes hasQuote = js{"is_quote_status"}.getBool
ent = ? noteTweet{"entity_set"} textRange = js{"display_text_range"}
hasCard = tweet.card.isSome textSlice = textRange{0}.getInt .. textRange{1}.getInt
var replacements = newSeq[ReplaceSlice]() var replyTo = ""
if tweet.replyId != 0:
with reply, js{"in_reply_to_screen_name"}:
replyTo = reply.getStr
tweet.reply.add replyTo
with urls, ent{"urls"}: tweet.expandTextEntities(entities, tweet.text, textSlice, replyTo, hasQuote)
for u in urls:
let urlStr = u["url"].getStr
if urlStr.len == 0 or urlStr notin text:
continue
replacements.extractUrls(u, orig.len, hideTwitter = false)
if hasCard and u{"url"}.getStr == get(tweet.card).url:
get(tweet.card).url = u{"expanded_url"}.getStr
if "hashtags" in ent: proc expandNoteTweetEntities*(tweet: Tweet; js: JsonNode) =
for hashtag in ent["hashtags"]: let
replacements.extractHashtags(hashtag) entities = ? js{"entity_set"}
text = js{"text"}.getStr
textSlice = 0..text.runeLen
if "symbols" in ent: tweet.expandTextEntities(entities, text, textSlice)
for symbol in ent["symbols"]:
replacements.extractHashtags(symbol)
if "user_mentions" in ent:
for mention in ent["user_mentions"]:
let
name = mention{"screen_name"}.getStr
slice = mention.extractSlice
idx = tweet.reply.find(name)
replacements.add ReplaceSlice(kind: rkMention, slice: slice,
url: "/" & name, display: mention["name"].getStr)
replacements.deduplicate
replacements.sort(cmp)
tweet.text = orig.replacedWith(replacements, 0..orig.len)
.strip(leading=false)

View File

@ -42,7 +42,7 @@ no_thumb = [
['nim_lang/status/1082989146040340480', ['nim_lang/status/1082989146040340480',
'Nim in 2018: A short recap', 'Nim in 2018: A short recap',
'36 votes and 46 comments so far on Reddit', 'Posted by u/miran1 - 36 votes and 46 comments',
'reddit.com'] 'reddit.com']
] ]