From 0da076ddcf932e1f7f14a5c8c09cea5567c4c3a3 Mon Sep 17 00:00:00 2001 From: Zed Date: Mon, 15 Jul 2019 13:40:59 +0200 Subject: [PATCH] Fix card link parsing edge cases --- src/formatters.nim | 6 ------ src/parser.nim | 7 +++++-- src/parserutils.nim | 10 +++++----- 3 files changed, 10 insertions(+), 13 deletions(-) diff --git a/src/formatters.nim b/src/formatters.nim index a21f4c1..5d0de04 100644 --- a/src/formatters.nim +++ b/src/formatters.nim @@ -10,7 +10,6 @@ const emailRegex = re"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)" usernameRegex = re"(^|[^A-z0-9_?])@([A-z0-9_]+)" picRegex = re"pic.twitter.com/[^ ]+" - cardRegex = re"(https?://)?cards.twitter.com/[^ ]+" ellipsisRegex = re" ?…" nbsp = $Rune(0x000A0) @@ -60,7 +59,6 @@ proc linkifyText*(text: string): string = proc stripTwitterUrls*(text: string): string = result = text result = result.replace(picRegex, "") - result = result.replace(cardRegex, "") result = result.replace(ellipsisRegex, "") proc getUserpic*(userpic: string; style=""): string = @@ -81,7 +79,3 @@ proc getTime*(tweet: Tweet): string = proc getLink*(tweet: Tweet | Quote): string = &"/{tweet.profile.username}/status/{tweet.id}" - -proc getUrls*(text: string): seq[string] = - # temporary - text.findAll(urlRegex).mapIt(text[it.group(0)[0]]) diff --git a/src/parser.nim b/src/parser.nim index a1289ae..07dd458 100644 --- a/src/parser.nim +++ b/src/parser.nim @@ -75,7 +75,7 @@ proc parseTweet*(node: XmlNode): Tweet = ) result.getTweetMedia(tweet) - result.getTweetCards(tweet) + result.getTweetCard(tweet) let by = tweet.selectText(".js-retweet-text > a > b") if by.len > 0: @@ -197,7 +197,10 @@ proc parseCard*(card: var Card; node: XmlNode) = card.text = node.selectText("p.tcu-resetMargin") card.dest = node.selectText("span.SummaryCard-destination") - let image = node.select(".tcu-imageWrapper > img") + if card.url.len == 0: + card.url = node.select("a").attr("href") + + let image = node.select(".tcu-imageWrapper img") if image != nil: # workaround for issue 11713 card.image = image.attr("data-src").replace("gname", "g&name") diff --git a/src/parserutils.nim b/src/parserutils.nim index d5b1c4e..25e0b5d 100644 --- a/src/parserutils.nim +++ b/src/parserutils.nim @@ -167,7 +167,7 @@ proc getQuoteMedia*(quote: var Quote; node: XmlNode) = elif gifBadge != nil: quote.badge = "GIF" -proc getTweetCards*(tweet: Tweet; node: XmlNode) = +proc getTweetCard*(tweet: Tweet; node: XmlNode) = if node.attr("data-has-cards") == "false": return let cardType = node.attr("data-card2-type") @@ -183,10 +183,10 @@ proc getTweetCards*(tweet: Tweet; node: XmlNode) = query: cardDiv.attr("data-src") ) - # temporary solution - let text = node.selectText(".tweet-text") - let urls = getUrls(text) - card.url = urls[0] + let cardUrl = cardDiv.attr("data-card-url") + for n in node.selectAll(".tweet-text a"): + if n.attr("href") == cardUrl: + card.url = n.attr("data-expanded-url") tweet.card = some(card)