Improve tweet url and hashtag parsing
This commit is contained in:
parent
50218bcc4d
commit
3b5b4b7682
|
@ -1,4 +1,4 @@
|
||||||
import strutils, times, macros, htmlgen, unicode, options
|
import strutils, times, macros, htmlgen, unicode, options, algorithm
|
||||||
import regex, packedjson
|
import regex, packedjson
|
||||||
import types, utils, formatters
|
import types, utils, formatters
|
||||||
|
|
||||||
|
@ -6,9 +6,18 @@ const
|
||||||
unRegex = re"(^|[^A-z0-9-_./?])@([A-z0-9_]{1,15})"
|
unRegex = re"(^|[^A-z0-9-_./?])@([A-z0-9_]{1,15})"
|
||||||
unReplace = "$1<a href=\"/$2\">@$2</a>"
|
unReplace = "$1<a href=\"/$2\">@$2</a>"
|
||||||
|
|
||||||
htRegex = re"(^|[^\w-_./?])([#$])([\w_]+)"
|
htRegex = re"(^|[^\w-_./?])([##$])([\w_]+)"
|
||||||
htReplace = "$1<a href=\"/search?q=%23$3\">$2$3</a>"
|
htReplace = "$1<a href=\"/search?q=%23$3\">$2$3</a>"
|
||||||
|
|
||||||
|
type
|
||||||
|
ReplaceSliceKind = enum
|
||||||
|
rkRemove, rkUrl, rkHashtag, rkMention
|
||||||
|
|
||||||
|
ReplaceSlice = object
|
||||||
|
slice: Slice[int]
|
||||||
|
kind: ReplaceSliceKind
|
||||||
|
url, display: string
|
||||||
|
|
||||||
template isNull*(js: JsonNode): bool = js.kind == JNull
|
template isNull*(js: JsonNode): bool = js.kind == JNull
|
||||||
template notNull*(js: JsonNode): bool = js.kind != JNull
|
template notNull*(js: JsonNode): bool = js.kind != JNull
|
||||||
|
|
||||||
|
@ -124,65 +133,97 @@ proc getTombstone*(js: JsonNode): string =
|
||||||
result = js{"tombstoneInfo", "richText", "text"}.getStr
|
result = js{"tombstoneInfo", "richText", "text"}.getStr
|
||||||
result.removeSuffix(" Learn more")
|
result.removeSuffix(" Learn more")
|
||||||
|
|
||||||
template getSlice(text: string; slice: seq[int]): string =
|
proc extractSlice(js: JsonNode): Slice[int] =
|
||||||
text.runeSubStr(slice[0], slice[1] - slice[0])
|
result = js["indices"][0].getInt ..< js["indices"][1].getInt
|
||||||
|
|
||||||
proc getSlice(text: string; js: JsonNode): string =
|
|
||||||
if js.kind != JArray or js.len < 2 or js[0].kind != JInt: return text
|
|
||||||
|
|
||||||
let slice = @[js{0}.getInt, js{1}.getInt]
|
|
||||||
text.getSlice(slice)
|
|
||||||
|
|
||||||
proc expandUrl(text: var string; js: JsonNode; tLen: int; hideTwitter=false) =
|
|
||||||
let u = js{"url"}.getStr
|
|
||||||
if u.len == 0 or u notin text:
|
|
||||||
return
|
|
||||||
|
|
||||||
|
proc extractUrls(result: var seq[ReplaceSlice]; js: JsonNode;
|
||||||
|
textLen: int; hideTwitter = false) =
|
||||||
let
|
let
|
||||||
url = js{"expanded_url"}.getStr
|
url = js["expanded_url"].getStr
|
||||||
slice = js{"indices"}[1].getInt
|
slice = js.extractSlice
|
||||||
|
|
||||||
if hideTwitter and slice >= tLen and url.isTwitterUrl:
|
if hideTwitter and slice.b.succ >= textLen and url.isTwitterUrl:
|
||||||
text = text.replace(u, "")
|
if slice.a < textLen:
|
||||||
text.removeSuffix(' ')
|
result.add ReplaceSlice(kind: rkRemove, slice: slice)
|
||||||
text.removeSuffix('\n')
|
|
||||||
else:
|
else:
|
||||||
text = text.replace(u, a(shortLink(url), href=url))
|
result.add ReplaceSlice(kind: rkUrl, url: url,
|
||||||
|
display: url.shortLink, slice: slice)
|
||||||
|
|
||||||
proc expandMention(text: var string; orig: string; js: JsonNode) =
|
proc extractHashtags(result: var seq[ReplaceSlice]; js: JsonNode) =
|
||||||
|
result.add ReplaceSlice(kind: rkHashtag, slice: js.extractSlice)
|
||||||
|
|
||||||
|
proc replacedWith(runes: seq[Rune]; repls: openArray[ReplaceSlice];
|
||||||
|
textSlice: Slice[int]): string =
|
||||||
|
template extractLowerBound(i: int; idx): int =
|
||||||
|
if i > 0: repls[idx].slice.b.succ else: textSlice.a
|
||||||
|
|
||||||
|
result = newStringOfCap(runes.len)
|
||||||
|
|
||||||
|
var upperBound = textSlice.b
|
||||||
|
for i, rep in repls:
|
||||||
|
result.add $runes[extractLowerBound(i, i - 1) ..< rep.slice.a]
|
||||||
|
case rep.kind
|
||||||
|
of rkHashtag:
|
||||||
let
|
let
|
||||||
name = js{"name"}.getStr
|
name = $runes[rep.slice.a.succ .. rep.slice.b]
|
||||||
href = '/' & js{"screen_name"}.getStr
|
symbol = $runes[rep.slice.a]
|
||||||
uname = orig.getSlice(js{"indices"})
|
result.add a(symbol & name, href = "/search?q=%23" & name)
|
||||||
text = text.replace(uname, a(uname, href=href, title=name))
|
of rkMention:
|
||||||
|
result.add a($runes[rep.slice], href = rep.url, title = rep.display)
|
||||||
|
of rkUrl:
|
||||||
|
result.add a(rep.display, href = rep.url)
|
||||||
|
of rkRemove:
|
||||||
|
discard
|
||||||
|
|
||||||
|
let rest = extractLowerBound(repls.len, ^1) ..< textSlice.b
|
||||||
|
if rest.a <= rest.b:
|
||||||
|
result.add $runes[rest]
|
||||||
|
|
||||||
|
proc deduplicate(s: var seq[ReplaceSlice]) =
|
||||||
|
var
|
||||||
|
len = s.len
|
||||||
|
i = 0
|
||||||
|
while i < len:
|
||||||
|
var j = i + 1
|
||||||
|
while j < len:
|
||||||
|
if s[i].slice.a == s[j].slice.a:
|
||||||
|
s.del j
|
||||||
|
dec len
|
||||||
|
else:
|
||||||
|
inc j
|
||||||
|
inc i
|
||||||
|
|
||||||
|
proc cmp(x, y: ReplaceSlice): int = cmp(x.slice.a, y.slice.b)
|
||||||
|
|
||||||
proc expandProfileEntities*(profile: var Profile; js: JsonNode) =
|
proc expandProfileEntities*(profile: var Profile; js: JsonNode) =
|
||||||
let
|
let
|
||||||
orig = profile.bio
|
orig = profile.bio.toRunes
|
||||||
ent = ? js{"entities"}
|
ent = ? js{"entities"}
|
||||||
|
|
||||||
with urls, ent{"url", "urls"}:
|
with urls, ent{"url", "urls"}:
|
||||||
profile.website = urls[0]{"expanded_url"}.getStr
|
profile.website = urls[0]{"expanded_url"}.getStr
|
||||||
|
|
||||||
with urls, ent{"description", "urls"}:
|
var replacements = newSeq[ReplaceSlice]()
|
||||||
for u in urls: profile.bio.expandUrl(u, orig.high)
|
|
||||||
|
|
||||||
|
with urls, ent{"description", "urls"}:
|
||||||
|
for u in urls:
|
||||||
|
replacements.extractUrls(u, orig.high)
|
||||||
|
|
||||||
|
replacements.deduplicate
|
||||||
|
replacements.sort(cmp)
|
||||||
|
|
||||||
|
profile.bio = orig.replacedWith(replacements, 0 .. orig.len)
|
||||||
profile.bio = profile.bio.replace(unRegex, unReplace)
|
profile.bio = profile.bio.replace(unRegex, unReplace)
|
||||||
.replace(htRegex, htReplace)
|
.replace(htRegex, htReplace)
|
||||||
|
|
||||||
for mention in ? ent{"user_mentions"}:
|
|
||||||
profile.bio.expandMention(orig, mention)
|
|
||||||
|
|
||||||
proc expandTweetEntities*(tweet: Tweet; js: JsonNode) =
|
proc expandTweetEntities*(tweet: Tweet; js: JsonNode) =
|
||||||
let
|
let
|
||||||
orig = tweet.text
|
orig = tweet.text.toRunes
|
||||||
textRange = js{"display_text_range"}
|
textRange = js{"display_text_range"}
|
||||||
slice = @[textRange{0}.getInt, textRange{1}.getInt]
|
textSlice = textRange{0}.getInt .. textRange{1}.getInt
|
||||||
hasQuote = js{"is_quote_status"}.getBool
|
hasQuote = js{"is_quote_status"}.getBool
|
||||||
hasCard = tweet.card.isSome
|
hasCard = tweet.card.isSome
|
||||||
|
|
||||||
tweet.text = tweet.text.getSlice(slice)
|
|
||||||
|
|
||||||
var replyTo = ""
|
var replyTo = ""
|
||||||
if tweet.replyId != 0:
|
if tweet.replyId != 0:
|
||||||
with reply, js{"in_reply_to_screen_name"}:
|
with reply, js{"in_reply_to_screen_name"}:
|
||||||
|
@ -191,26 +232,45 @@ proc expandTweetEntities*(tweet: Tweet; js: JsonNode) =
|
||||||
|
|
||||||
let ent = ? js{"entities"}
|
let ent = ? js{"entities"}
|
||||||
|
|
||||||
|
var replacements = newSeq[ReplaceSlice]()
|
||||||
|
|
||||||
with urls, ent{"urls"}:
|
with urls, ent{"urls"}:
|
||||||
for u in urls:
|
for u in urls:
|
||||||
tweet.text.expandUrl(u, slice[1], hasQuote)
|
let urlStr = u["url"].getStr
|
||||||
|
if urlStr.len == 0 or urlStr notin tweet.text:
|
||||||
|
continue
|
||||||
|
replacements.extractUrls(u, textSlice.b, hideTwitter = hasQuote)
|
||||||
if hasCard and u{"url"}.getStr == get(tweet.card).url:
|
if hasCard and u{"url"}.getStr == get(tweet.card).url:
|
||||||
get(tweet.card).url = u{"expanded_url"}.getStr
|
get(tweet.card).url = u{"expanded_url"}.getStr
|
||||||
|
|
||||||
with media, ent{"media"}:
|
with media, ent{"media"}:
|
||||||
for m in media: tweet.text.expandUrl(m, slice[1], hideTwitter=true)
|
for m in media:
|
||||||
|
replacements.extractUrls(m, textSlice.b, hideTwitter = true)
|
||||||
|
|
||||||
if "hashtags" in ent or "symbols" in ent:
|
if "hashtags" in ent:
|
||||||
tweet.text = tweet.text.replace(htRegex, htReplace)
|
for hashtag in ent["hashtags"]:
|
||||||
|
replacements.extractHashtags(hashtag)
|
||||||
|
|
||||||
for mention in ? ent{"user_mentions"}:
|
if "symbols" in ent:
|
||||||
|
for symbol in ent["symbols"]:
|
||||||
|
replacements.extractHashtags(symbol)
|
||||||
|
|
||||||
|
if "user_mentions" in ent:
|
||||||
|
for mention in ent["user_mentions"]:
|
||||||
let
|
let
|
||||||
name = mention{"screen_name"}.getStr
|
name = mention{"screen_name"}.getStr
|
||||||
|
slice = mention.extractSlice
|
||||||
idx = tweet.reply.find(name)
|
idx = tweet.reply.find(name)
|
||||||
|
|
||||||
if mention{"indices"}[0].getInt >= slice[0]:
|
if slice.a >= textSlice.a:
|
||||||
tweet.text.expandMention(orig, mention)
|
replacements.add ReplaceSlice(kind: rkMention, slice: slice,
|
||||||
|
url: "/" & name, display: mention["name"].getStr)
|
||||||
if idx > -1 and name != replyTo:
|
if idx > -1 and name != replyTo:
|
||||||
tweet.reply.delete idx
|
tweet.reply.delete idx
|
||||||
elif idx == -1 and tweet.replyId != 0:
|
elif idx == -1 and tweet.replyId != 0:
|
||||||
tweet.reply.add name
|
tweet.reply.add name
|
||||||
|
|
||||||
|
replacements.deduplicate
|
||||||
|
replacements.sort(cmp)
|
||||||
|
|
||||||
|
tweet.text = orig.replacedWith(replacements, textSlice)
|
||||||
|
|
Loading…
Reference in New Issue