nitter/src/parserutils.nim

268 lines
8.2 KiB
Nim
Raw Normal View History

import xmltree, strtabs, strformat, strutils, times, uri
2019-06-26 16:51:21 +00:00
import regex
2019-06-23 23:34:30 +00:00
2019-08-11 19:26:37 +00:00
import types, formatters
2019-06-23 23:34:30 +00:00
2019-06-26 16:51:21 +00:00
from q import nil
from htmlgen import a
2019-06-26 16:51:21 +00:00
2019-06-23 23:34:30 +00:00
const
  # Matches text containing `:url('...')` and captures what sits between the
  # single quotes. getGif applies it to a `style` attribute to obtain the
  # thumbnail URL.
  thumbRegex = re".+:url\('([^']+)'\)"
  # Captures the segment that follows `thumb/`, up to the first `.` or `'`.
  # getGif uses the capture as the id in the video URL. `[jpng]` only checks
  # the first letter after the dot (presumably jpg/png - confirm); `.*`
  # consumes the rest.
  gifRegex = re".+thumb/([^\.']+)\.[jpng].*"
2019-06-23 23:34:30 +00:00
2019-06-26 16:51:21 +00:00
proc selectAll*(node: XmlNode; selector: string): seq[XmlNode] =
  ## Returns every node under `node` that `q.select` finds for `selector`.
  ## A nil `node` yields an empty seq instead of being passed to `q`.
  if not node.isNil:
    result = q.select(node, selector)
proc select*(node: XmlNode; selector: string): XmlNode =
  ## Returns the first node matched by `selector` under `node`.
  ## The result is nil when `node` is nil or nothing matches.
  if node.isNil:
    return nil
  let matches = node.selectAll(selector)
  if matches.len > 0:
    result = matches[0]
2019-06-27 19:07:29 +00:00
proc selectAttr*(node: XmlNode; selector: string; attr: string): string =
  ## Returns the value of attribute `attr` on the first node matched by
  ## `selector`, or the empty string when no node matches.
  let found = node.select(selector)
  if found.isNil:
    return ""
  found.attr(attr)
2019-06-23 23:34:30 +00:00
proc selectText*(node: XmlNode; selector: string): string =
  ## Returns the inner text of the first node matched by `selector`,
  ## or the empty string when no node matches.
  let found = node.select(selector)
  if found != nil:
    result = found.innerText()
2019-06-24 07:30:34 +00:00
proc getHeader(profile: XmlNode): XmlNode =
  ## Finds the header node inside `profile` by trying a fixed list of
  ## selectors in order and returning the first hit. When none of them
  ## matches, `profile` itself is returned.
  for sel in [".permalink-header", ".stream-item-header",
              ".ProfileCard-userFields"]:
    result = profile.select(sel)
    if result != nil:
      return
  result = profile
2019-06-24 07:30:34 +00:00
2019-06-23 23:34:30 +00:00
proc isVerified*(profile: XmlNode): bool =
  ## True when the header of `profile` contains a node matching
  ## `.Icon.Icon--verified`.
  let header = getHeader(profile)
  result = not header.select(".Icon.Icon--verified").isNil
2019-06-23 23:34:30 +00:00
proc isProtected*(profile: XmlNode): bool =
  ## True when the header of `profile` contains a node matching
  ## `.Icon.Icon--protected`.
  let header = getHeader(profile)
  result = not header.select(".Icon.Icon--protected").isNil
2019-06-23 23:34:30 +00:00
proc parseText*(text: XmlNode; skipLink=""): string =
  ## Flattens the children of `text` into a string of text and anchor tags.
  ##
  ## * Text nodes are appended as-is.
  ## * Elements without attributes are dropped, except `strong`, which is
  ##   appended in serialized form.
  ## * Elements carrying `data-expanded-url` become links to that URL,
  ##   labelled with `shortLink(url)`. A URL equal to `skipLink` is omitted.
  ##   When the class contains `u-hidden` and output already exists, a
  ##   newline is inserted before the link.
  ## * Classes containing `ashtag` or `hashflag` become `/search?q=` links.
  ## * Classes containing `atreply` become links to the element's own `href`.
  ## * Classes containing `Emoji` contribute their `alt` attribute.
  ##
  ## Any other node is ignored. A nil `text` gives the empty string.
  if text.isNil: return
  for child in text:
    case child.kind
    of xnText:
      result.add child
    of xnElement:
      if child.attrs.isNil:
        # No attributes: only a bare <strong> is worth keeping.
        if child.tag == "strong":
          result.add $child
        continue

      let cls = child.attr("class")
      if "data-expanded-url" in child.attrs:
        let expanded = child.attr("data-expanded-url")
        if expanded == skipLink:
          continue
        if result.len > 0 and "u-hidden" in cls:
          result.add "\n"
        result.add a(shortLink(expanded), href=expanded)
      elif "ashtag" in cls or "hashflag" in cls:
        let tag = child.innerText()
        result.add a(tag, href=("/search?q=" & encodeUrl(tag)))
      elif "atreply" in cls:
        result.add a(child.innerText(), href=child.attr("href"))
      elif "Emoji" in cls:
        result.add child.attr("alt")
    else:
      discard
2019-06-25 00:38:18 +00:00
2019-06-25 02:52:38 +00:00
proc getQuoteText*(tweet: XmlNode): string =
  ## Parses the `.QuoteTweet-text` node of `tweet` with `parseText`.
  let quoteText = tweet.select(".QuoteTweet-text")
  result = parseText(quoteText)
2019-06-25 02:52:38 +00:00
2019-06-23 23:34:30 +00:00
proc getTweetText*(tweet: XmlNode): string =
  ## Parses the `.tweet-text` node of `tweet` with `parseText`.
  ##
  ## When the tweet contains a `.QuoteTweet` node, the expanded URL of the
  ## hidden timeline link is passed as `skipLink`, so it is left out of the
  ## returned text.
  let body = tweet.select(".tweet-text")
  var skip = ""
  if tweet.select(".QuoteTweet") != nil:
    skip = body.selectAttr("a.twitter-timeline-link.u-hidden",
                           "data-expanded-url")
  parseText(body, skip)
2019-06-23 23:34:30 +00:00
proc getTimestamp*(tweet: XmlNode): Time =
  ## Reads the `data-time` attribute of `.js-short-timestamp` and converts
  ## it with `fromUnix`. A missing or empty attribute gives Unix time 0.
  ## A non-numeric value makes `parseInt` raise `ValueError`.
  let raw = tweet.selectAttr(".js-short-timestamp", "data-time")
  let seconds =
    if raw.len == 0: 0
    else: parseInt(raw)
  fromUnix(seconds)
2019-06-23 23:34:30 +00:00
proc getShortTime*(tweet: XmlNode): string =
  ## Returns the inner text of the `.js-short-timestamp` node, or the empty
  ## string when it is absent.
  result = selectText(tweet, ".js-short-timestamp")
2019-06-23 23:34:30 +00:00
proc getDate*(node: XmlNode; selector: string): Time =
  ## Parses the `title` attribute of the node matched by `selector` as a
  ## UTC time in the format `h:mm tt - d MMM YYYY`.
  ## Returns the default `Time` value when no node matches.
  let dateNode = node.select(selector)
  if dateNode != nil:
    result = parseTime(dateNode.attr("title"), "h:mm tt - d MMM YYYY", utc())
proc getName*(profile: XmlNode; selector: string): string =
  ## Returns the text of the node matched by `selector`, passed through
  ## `stripText`.
  let raw = profile.selectText(selector)
  result = stripText(raw)
proc getUsername*(profile: XmlNode; selector: string): string =
  ## Returns the text of the node matched by `selector` with leading and
  ## trailing `@`, space and newline characters removed.
  const trimmed = {'@', ' ', '\n'}
  let raw = profile.selectText(selector)
  result = strip(raw, chars = trimmed)
2019-09-13 20:24:58 +00:00
proc getBio*(profile: XmlNode; selector: string; fallback=""): string =
  ## Parses the node matched by `selector` with `parseText`.
  ## When that node is missing and `fallback` is non-empty, the node matched
  ## by `fallback` is parsed instead.
  let primary = profile.select(selector)
  let bioNode =
    if primary.isNil and fallback.len > 0: profile.select(fallback)
    else: primary
  parseText(bioNode)
2019-06-23 23:34:30 +00:00
proc getLocation*(profile: XmlNode): string =
  ## Returns the stripped text of `.ProfileHeaderCard-locationText`.
  ## When the location contains a link with a `data-place-id` attribute,
  ## `":" & placeId` is appended to the text.
  const locSel = ".ProfileHeaderCard-locationText"
  let placeId = profile.selectAttr(locSel & " a", "data-place-id")
  result = stripText(profile.selectText(locSel))
  if placeId.len > 0:
    result.add ":"
    result.add placeId
2019-06-23 23:34:30 +00:00
proc getAvatar*(profile: XmlNode; selector: string): string =
  ## Reads the `src` attribute of the node matched by `selector` and passes
  ## it through `getUserpic`.
  let src = profile.selectAttr(selector, "src")
  result = getUserpic(src)
proc getBanner*(node: XmlNode): string =
  ## Returns a banner value for `node`.
  ##
  ## Preference order:
  ## 1. the `xlink:href` of `svg > image`, with `600x200` replaced by
  ##    `1500x500`;
  ## 2. the `style` of `.ProfileCard-bg` with the `background-color: `
  ##    prefix removed;
  ## 3. the constant `#161616`.
  let imageUrl = node.selectAttr("svg > image", "xlink:href")
  if imageUrl.len > 0:
    return imageUrl.replace("600x200", "1500x500")

  let style = node.selectAttr(".ProfileCard-bg", "style")
  result = style.replace("background-color: ", "")
  if result.len == 0:
    result = "#161616"
2019-06-23 23:34:30 +00:00
proc getTimelineBanner*(node: XmlNode): string =
  ## Returns the banner for a timeline page.
  ##
  ## The `src` of `.ProfileCanopy-headerBg img` is preferred. When it is
  ## missing or empty, the first `<style>` element is searched for an
  ## `a:active` color rule and the captured `#...` value is returned.
  ## The result is the empty string when neither source is available.
  # selectAttr/selectText are nil-safe, so a page without the banner image
  # (or without a <style> element) falls through instead of dereferencing
  # a nil node.
  let img = node.selectAttr(".ProfileCanopy-headerBg img", "src")
  if img.len > 0:
    return img

  let style = node.selectText("style")
  var m: RegexMatch
  if style.find(re"a:active \{\n +color: (#[A-Z0-9]+)", m):
    return style[m.group(0)[0]]
2019-08-11 21:24:02 +00:00
proc getMediaCount*(node: XmlNode): string =
  ## Returns the first space-separated token of the stripped
  ## `.PhotoRail-headingWithCount` text.
  let heading = node.selectText(".PhotoRail-headingWithCount").stripText()
  result = heading.split(" ")[0]
proc getProfileStats*(profile: var Profile; node: XmlNode) =
  ## Fills the counters of `profile` from the `.ProfileNav-stat` nodes.
  ## The `data-nav` attribute selects the field, and the value is the first
  ## space-separated token of the `title` attribute. Unknown `data-nav`
  ## values are ignored.
  for stat in node.selectAll(".ProfileNav-stat"):
    let
      count = stat.attr("title").split(" ")[0]
      kind = stat.attr("data-nav")
    if kind == "followers": profile.followers = count
    elif kind == "following": profile.following = count
    elif kind == "favorites": profile.likes = count
    elif kind == "tweets": profile.tweets = count
2019-06-23 23:34:30 +00:00
proc getPopupStats*(profile: var Profile; node: XmlNode) =
  ## Fills the counters of `profile` from the `.ProfileCardStats-statLink`
  ## nodes. The last path segment of `href` selects the field: `followers`
  ## and `following` map to their fields, anything else is stored as the
  ## tweet count. The value is the first space-separated token of `title`.
  for link in node.selectAll(".ProfileCardStats-statLink"):
    let
      count = link.attr("title").split(" ")[0]
      target = link.attr("href").split("/")[^1]
    if target == "followers":
      profile.followers = count
    elif target == "following":
      profile.following = count
    else:
      profile.tweets = count
proc getIntentStats*(profile: var Profile; node: XmlNode) =
  ## Fills the follower and following counters of `profile` from the
  ## `dd.count > a` links, keyed by the last path segment of `href`.
  ## The tweet count is always set to `"?"`.
  profile.tweets = "?"
  for link in node.selectAll("dd.count > a"):
    let count = link.innerText()
    let target = link.attr("href").split("/")[^1]
    if target == "followers":
      profile.followers = count
    elif target == "following":
      profile.following = count
2019-07-01 21:48:25 +00:00
proc parseTweetStats*(node: XmlNode): TweetStats =
  ## Collects reply, retweet and like counts from the
  ## `.ProfileTweet-actionCountForAria` nodes under `node`.
  ##
  ## Each node's text is split on whitespace. The first token is taken as
  ## the count and the second as the label. The label is matched by its
  ## `ret` / `rep` / `lik` prefix. All counters start at `"0"`, and entries
  ## that cannot be interpreted are skipped.
  result = TweetStats(replies: "0", retweets: "0", likes: "0")
  for action in node.selectAll(".ProfileTweet-actionCountForAria"):
    let text = action.innerText.split()
    # Both a count and a label are required. Indexing text[1] on a shorter
    # split would raise an index defect.
    if text.len < 2: continue
    let label = text[1]
    # startsWith is also safe for labels shorter than three characters,
    # unlike slicing with `[0 .. 2]`.
    if label.startsWith("ret"): result.retweets = text[0]
    elif label.startsWith("rep"): result.replies = text[0]
    elif label.startsWith("lik"): result.likes = text[0]
2019-06-23 23:34:30 +00:00
proc parseTweetReply*(node: XmlNode): seq[string] =
  ## Returns the inner text of each name element inside
  ## `.ReplyingToContextBelowAuthor`. The names are selected with `b` when
  ## the class of `node` contains `Quote`, and with `a b` otherwise.
  ## The result is empty when the context node is absent.
  let context = node.select(".ReplyingToContextBelowAuthor")
  if context.isNil: return

  let inQuote = "Quote" in node.attr("class")
  let nameSel = if inQuote: "b" else: "a b"
  for name in context.selectAll(nameSel):
    result.add name.innerText()
2019-06-24 03:14:14 +00:00
proc getGif(player: XmlNode): Gif =
  ## Builds a `Gif` from the `style` attribute of `player`.
  ## The thumbnail is extracted with `thumbRegex`. The id captured from it
  ## by `gifRegex` is inserted into the `tweet_video` mp4 URL.
  let style = player.attr("style")
  let thumbUrl = style.replace(thumbRegex, "$1")
  let videoId = thumbUrl.replace(gifRegex, "$1")
  result = Gif(
    url: "https://video.twimg.com/tweet_video/" & videoId & ".mp4",
    thumb: thumbUrl
  )
proc getTweetMedia*(tweet: Tweet; node: XmlNode) =
  ## Populates the media fields of `tweet` from `node`.
  ##
  ## * Every `.AdaptiveMedia-photoContainer` that has a `data-image-url`
  ##   attribute adds that URL to `tweet.photos`.
  ## * When a `.PlayableMedia` node exists, its class decides the rest:
  ##   a class containing `gif` sets `tweet.gif`, and one containing `video`
  ##   sets `tweet.video`. The video thumbnail is the second-to-last piece
  ##   of the player's `style` split on `'`, when there is one.
  for photo in node.selectAll(".AdaptiveMedia-photoContainer"):
    # attr() returns "" for a missing attribute, whereas attrs[...] raises.
    # Containers without an image URL are skipped.
    let url = photo.attr("data-image-url")
    if url.len > 0:
      tweet.photos.add url

  let player = node.select(".PlayableMedia")
  if player == nil: return

  let cls = player.attr("class")
  if "gif" in cls:
    # getGif reads attributes from the node, so a missing inner player
    # must not be passed on.
    let gifPlayer = player.select(".PlayableMedia-player")
    if gifPlayer != nil:
      tweet.gif = some getGif(gifPlayer)
  elif "video" in cls:
    let thumb = player.selectAttr(".PlayableMedia-player", "style").split("'")
    if thumb.len > 1:
      tweet.video = some Video(thumb: thumb[^2])
    else:
      tweet.video = some Video()
2019-06-24 06:07:36 +00:00
proc getQuoteMedia*(quote: var Quote; node: XmlNode) =
  ## Fills the media fields of `quote` from `node`.
  ##
  ## When a `.QuoteTweet--sensitive` node is present, only
  ## `quote.sensitive` is set. Otherwise the thumbnail is the `src` of the
  ## image inside `.QuoteMedia`, and the badge is the text of
  ## `.AdaptiveMedia-badgeText`, or `"GIF"` when only `.Icon--gifBadge`
  ## exists.
  if not node.select(".QuoteTweet--sensitive").isNil:
    quote.sensitive = true
    return

  let mediaNode = node.select(".QuoteMedia")
  if not mediaNode.isNil:
    quote.thumb = mediaNode.selectAttr("img", "src")

  let textBadge = node.select(".AdaptiveMedia-badgeText")
  if not textBadge.isNil:
    quote.badge = textBadge.innerText()
  elif not node.select(".Icon--gifBadge").isNil:
    quote.badge = "GIF"
2019-06-29 12:11:23 +00:00
2019-07-15 11:40:59 +00:00
proc getTweetCard*(tweet: Tweet; node: XmlNode) =
  ## Attaches card information found on `node` to `tweet`.
  ##
  ## Nothing happens when `data-has-cards` is `"false"`. The card type is
  ## the part of `data-card2-type` after the last `:`. A type containing
  ## `poll` sets an empty `tweet.poll`, and a type containing `message_me`
  ## is ignored. Otherwise a `Card` is built from the iframe container.
  ## Its kind comes from `parseEnum`, with `summary` as the fallback. Its
  ## URL is the expanded URL of the tweet-text link whose `href` equals the
  ## container's `data-card-url`.
  if node.attr("data-has-cards") == "false":
    return

  # split() returns the whole string when there is no ':', so taking the
  # last piece covers both the prefixed and the plain form.
  let cardType = node.attr("data-card2-type").split(":")[^1]

  if "poll" in cardType:
    tweet.poll = some Poll()
    return
  if "message_me" in cardType:
    return

  let container = node.select(".card2 > .js-macaw-cards-iframe-container")
  if container.isNil:
    return

  var card = Card(id: $tweet.id, query: container.attr("data-src"))
  card.kind = parseEnum[CardKind](cardType, summary)

  let target = container.attr("data-card-url")
  for link in node.selectAll(".tweet-text a"):
    if link.attr("href") == target:
      card.url = link.attr("data-expanded-url")

  tweet.card = some card
2019-07-01 01:13:12 +00:00
proc getMoreReplies*(node: XmlNode): int =
  ## Parses the leading integer from the inner text of `node`.
  ##
  ## The text is stripped and its first space-separated token is parsed.
  ## Returns -1 when that token is not a valid integer.
  let text = node.innerText().strip()
  # split() always yields at least one element, so only parseInt can fail.
  # Catching ValueError alone keeps defects from being swallowed.
  result =
    try: parseInt(text.split(" ")[0])
    except ValueError: -1