Rewrite text parsing to ensure correctness
This commit is contained in:
		
							parent
							
								
									83a651e732
								
							
						
					
					
						commit
						7b766b793b
					
				| 
						 | 
				
			
			@ -6,12 +6,6 @@ import types, utils, query
 | 
			
		|||
from unicode import Rune, `$`
 | 
			
		||||
 | 
			
		||||
const
 | 
			
		||||
  urlRegex = re"((https?|ftp)://(-\.)?([^\s/?\.#]+\.?)+([/\?][^\s\)]*)?)"
 | 
			
		||||
  emailRegex = re"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)"
 | 
			
		||||
  usernameRegex = re"(^|[^A-z0-9_?\/])@([A-z0-9_]+)"
 | 
			
		||||
  picRegex = re"pic.twitter.com/[^ ]+"
 | 
			
		||||
  ellipsisRegex = re" ?…"
 | 
			
		||||
  hashtagRegex = re"([^\S]|^)([#$]\w+)"
 | 
			
		||||
  ytRegex = re"(www.|m.)?youtu(be.com|.be)"
 | 
			
		||||
  twRegex = re"(www.|mobile.)?twitter.com"
 | 
			
		||||
  nbsp = $Rune(0x000A0)
 | 
			
		||||
| 
						 | 
				
			
			@ -26,75 +20,14 @@ proc shortLink*(text: string; length=28): string =
 | 
			
		|||
  if result.len > length:
 | 
			
		||||
    result = result[0 ..< length] & "…"
 | 
			
		||||
 | 
			
		||||
proc toLink*(url, text: string): string =
 | 
			
		||||
  a(text, href=url)
 | 
			
		||||
 | 
			
		||||
proc reUrlToShortLink*(m: RegexMatch; s: string): string =
 | 
			
		||||
  let url = s[m.group(0)[0]]
 | 
			
		||||
  toLink(url, shortLink(url))
 | 
			
		||||
 | 
			
		||||
proc reUrlToLink*(m: RegexMatch; s: string): string =
 | 
			
		||||
  let url = s[m.group(0)[0]]
 | 
			
		||||
  toLink(url, url.replace(re"https?://(www.)?", ""))
 | 
			
		||||
 | 
			
		||||
proc reEmailToLink*(m: RegexMatch; s: string): string =
 | 
			
		||||
  let url = s[m.group(0)[0]]
 | 
			
		||||
  toLink("mailto://" & url, url)
 | 
			
		||||
 | 
			
		||||
proc reHashtagToLink*(m: RegexMatch; s: string): string =
 | 
			
		||||
  result = if m.group(0).len > 0: s[m.group(0)[0]] else: ""
 | 
			
		||||
  let hash = s[m.group(1)[0]]
 | 
			
		||||
  let link = toLink("/search?q=" & encodeUrl(hash), hash)
 | 
			
		||||
  if hash.any(isAlphaAscii):
 | 
			
		||||
    result &= link
 | 
			
		||||
  else:
 | 
			
		||||
    result &= hash
 | 
			
		||||
 | 
			
		||||
proc reUsernameToLink*(m: RegexMatch; s: string): string =
 | 
			
		||||
  var username = ""
 | 
			
		||||
  var pretext = ""
 | 
			
		||||
 | 
			
		||||
  let pre = m.group(0)
 | 
			
		||||
  let match = m.group(1)
 | 
			
		||||
 | 
			
		||||
  username = s[match[0]]
 | 
			
		||||
 | 
			
		||||
  if pre.len > 0:
 | 
			
		||||
    pretext = s[pre[0]]
 | 
			
		||||
 | 
			
		||||
  pretext & toLink("/" & username, "@" & username)
 | 
			
		||||
 | 
			
		||||
proc reUsernameToFullLink*(m: RegexMatch; s: string): string =
 | 
			
		||||
  result = reUsernameToLink(m, s)
 | 
			
		||||
  result = result.replace("href=\"/", &"href=\"https://{hostname}/")
 | 
			
		||||
 | 
			
		||||
proc replaceUrl*(url: string; prefs: Prefs): string =
 | 
			
		||||
proc replaceUrl*(url: string; prefs: Prefs; rss=false): string =
 | 
			
		||||
  result = url
 | 
			
		||||
  if prefs.replaceYouTube.len > 0:
 | 
			
		||||
    result = result.replace(ytRegex, prefs.replaceYouTube)
 | 
			
		||||
  if prefs.replaceTwitter.len > 0:
 | 
			
		||||
    result = result.replace(twRegex, prefs.replaceTwitter)
 | 
			
		||||
 | 
			
		||||
proc linkifyText*(text: string; prefs: Prefs; rss=false): string =
 | 
			
		||||
  result = xmltree.escape(stripText(text))
 | 
			
		||||
  result = result.replace(ellipsisRegex, " ")
 | 
			
		||||
  result = result.replace(emailRegex, reEmailToLink)
 | 
			
		||||
  if rss:
 | 
			
		||||
    result = result.replace(urlRegex, reUrlToLink)
 | 
			
		||||
    result = result.replace(usernameRegex, reUsernameToFullLink)
 | 
			
		||||
  else:
 | 
			
		||||
    result = result.replace(urlRegex, reUrlToShortLink)
 | 
			
		||||
    result = result.replace(usernameRegex, reUsernameToLink)
 | 
			
		||||
  result = result.replace(hashtagRegex, reHashtagToLink)
 | 
			
		||||
  result = result.replace(re"([^\s\(\n%])<a", "$1 <a")
 | 
			
		||||
  result = result.replace(re"</a>\s+([;.,!\)'%]|')", "</a>$1")
 | 
			
		||||
  result = result.replace(re"^\. <a", ".<a")
 | 
			
		||||
  result = result.replaceUrl(prefs)
 | 
			
		||||
 | 
			
		||||
proc stripTwitterUrls*(text: string): string =
 | 
			
		||||
  result = text
 | 
			
		||||
  result = result.replace(picRegex, "")
 | 
			
		||||
  result = result.replace(ellipsisRegex, "")
 | 
			
		||||
    result = result.replace("href=\"/", "href=\"" & hostname & "/")
 | 
			
		||||
 | 
			
		||||
proc proxifyVideo*(manifest: string; proxy: bool): string =
 | 
			
		||||
  proc cb(m: RegexMatch; s: string): string =
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,9 +1,10 @@
 | 
			
		|||
import xmltree, strtabs, strformat, strutils, times
 | 
			
		||||
import xmltree, strtabs, strformat, strutils, times, uri
 | 
			
		||||
import regex
 | 
			
		||||
 | 
			
		||||
import types, formatters
 | 
			
		||||
 | 
			
		||||
from q import nil
 | 
			
		||||
from htmlgen import a
 | 
			
		||||
 | 
			
		||||
const
 | 
			
		||||
  thumbRegex = re".+:url\('([^']+)'\)"
 | 
			
		||||
| 
						 | 
				
			
			@ -41,29 +42,41 @@ proc isVerified*(profile: XmlNode): bool =
 | 
			
		|||
proc isProtected*(profile: XmlNode): bool =
 | 
			
		||||
  getHeader(profile).select(".Icon.Icon--protected") != nil
 | 
			
		||||
 | 
			
		||||
proc emojify*(node: XmlNode) =
 | 
			
		||||
  for i in node.selectAll(".Emoji"):
 | 
			
		||||
    i.add newText(i.attr("alt"))
 | 
			
		||||
proc parseText*(text: XmlNode; skipLink=""): string =
 | 
			
		||||
  for el in text:
 | 
			
		||||
    case el.kind
 | 
			
		||||
    of xnText:
 | 
			
		||||
      result.add el
 | 
			
		||||
    of xnElement:
 | 
			
		||||
      if el.attrs == nil:
 | 
			
		||||
        if el.tag == "strong":
 | 
			
		||||
          result.add $el
 | 
			
		||||
        continue
 | 
			
		||||
 | 
			
		||||
      let class = el.attr("class")
 | 
			
		||||
      if "data-expanded-url" in el.attrs:
 | 
			
		||||
        let url = el.attr("data-expanded-url")
 | 
			
		||||
        if url == skipLink: continue
 | 
			
		||||
        elif "u-hidden" in class: result.add "\n"
 | 
			
		||||
        result.add a(shortLink(url), href=url)
 | 
			
		||||
      elif "ashtag" in class:
 | 
			
		||||
        let hash = el.innerText()
 | 
			
		||||
        result.add a(hash, href=("/search?q=" & encodeUrl(hash)))
 | 
			
		||||
      elif "atreply" in class:
 | 
			
		||||
        result.add a(el.innerText(), href=el.attr("href"))
 | 
			
		||||
      elif "Emoji" in class:
 | 
			
		||||
        result.add el.attr("alt")
 | 
			
		||||
    else: discard
 | 
			
		||||
 | 
			
		||||
proc getQuoteText*(tweet: XmlNode): string =
 | 
			
		||||
  let text = tweet.select(".QuoteTweet-text")
 | 
			
		||||
  emojify(text)
 | 
			
		||||
  result = stripText(text.innerText())
 | 
			
		||||
  result = stripTwitterUrls(result)
 | 
			
		||||
  parseText(tweet.select(".QuoteTweet-text"))
 | 
			
		||||
 | 
			
		||||
proc getTweetText*(tweet: XmlNode): string =
 | 
			
		||||
  let
 | 
			
		||||
    quote = tweet.select(".QuoteTweet")
 | 
			
		||||
    text = tweet.select(".tweet-text")
 | 
			
		||||
    link = text.selectAttr("a.twitter-timeline-link.u-hidden", "data-expanded-url")
 | 
			
		||||
 | 
			
		||||
  emojify(text)
 | 
			
		||||
  result = stripText(text.innerText())
 | 
			
		||||
 | 
			
		||||
  if quote != nil and link.len > 0:
 | 
			
		||||
    result = result.replace(link, "")
 | 
			
		||||
 | 
			
		||||
  result = stripTwitterUrls(result)
 | 
			
		||||
  parseText(text, if quote != nil: link else: "")
 | 
			
		||||
 | 
			
		||||
proc getTime(tweet: XmlNode): XmlNode =
 | 
			
		||||
  tweet.select(".js-short-timestamp")
 | 
			
		||||
| 
						 | 
				
			
			@ -87,10 +100,10 @@ proc getUsername*(profile: XmlNode; selector: string): string =
 | 
			
		|||
  profile.selectText(selector).strip(chars={'@', ' ', '\n'})
 | 
			
		||||
 | 
			
		||||
proc getBio*(profile: XmlNode; selector: string; fallback=""): string =
 | 
			
		||||
  var bio = profile.selectText(selector)
 | 
			
		||||
  if bio.len == 0 and fallback.len > 0:
 | 
			
		||||
    bio = profile.selectText(fallback)
 | 
			
		||||
  stripText(bio)
 | 
			
		||||
  var bio = profile.select(selector)
 | 
			
		||||
  if bio == nil and fallback.len > 0:
 | 
			
		||||
    bio = profile.select(fallback)
 | 
			
		||||
  parseText(bio)
 | 
			
		||||
 | 
			
		||||
proc getLocation*(profile: XmlNode): string =
 | 
			
		||||
  let sel = ".ProfileHeaderCard-locationText"
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -25,7 +25,7 @@ proc renderProfileCard*(profile: Profile; prefs: Prefs): VNode =
 | 
			
		|||
    tdiv(class="profile-card-extra"):
 | 
			
		||||
      if profile.bio.len > 0:
 | 
			
		||||
        tdiv(class="profile-bio"):
 | 
			
		||||
          p: verbatim linkifyText(profile.bio, prefs)
 | 
			
		||||
          p: verbatim replaceUrl(profile.bio, prefs)
 | 
			
		||||
 | 
			
		||||
      if profile.location.len > 0:
 | 
			
		||||
        tdiv(class="profile-location"):
 | 
			
		||||
| 
						 | 
				
			
			@ -39,8 +39,9 @@ proc renderProfileCard*(profile: Profile; prefs: Prefs): VNode =
 | 
			
		|||
      if profile.website.len > 0:
 | 
			
		||||
        tdiv(class="profile-website"):
 | 
			
		||||
          span:
 | 
			
		||||
            let url = replaceUrl(profile.website, prefs)
 | 
			
		||||
            icon "link"
 | 
			
		||||
            verbatim linkifyText(profile.website, prefs)
 | 
			
		||||
            a(href=url): text shortLink(url)
 | 
			
		||||
 | 
			
		||||
      tdiv(class="profile-joindate"):
 | 
			
		||||
        span(title=getJoinDateFull(profile)):
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -7,7 +7,7 @@
 | 
			
		|||
#if tweet.pinned: result = "Pinned: "
 | 
			
		||||
#elif tweet.retweet.isSome: result = "RT: "
 | 
			
		||||
#end if
 | 
			
		||||
#result &= xmltree.escape(replaceUrl(tweet.text, prefs))
 | 
			
		||||
#result &= xmltree.escape(replaceUrl(tweet.text, prefs, rss=true))
 | 
			
		||||
#if result.len > 0: return
 | 
			
		||||
#end if
 | 
			
		||||
#if tweet.photos.len > 0:
 | 
			
		||||
| 
						 | 
				
			
			@ -20,7 +20,7 @@
 | 
			
		|||
#end proc
 | 
			
		||||
#
 | 
			
		||||
#proc renderRssTweet(tweet: Tweet; prefs: Prefs): string =
 | 
			
		||||
#let text = linkifyText(tweet.text, prefs, rss=true)
 | 
			
		||||
#let text = replaceUrl(tweet.text, prefs, rss=true)
 | 
			
		||||
#if tweet.quote.isSome and get(tweet.quote).available:
 | 
			
		||||
#let quoteLink = hostname & getLink(get(tweet.quote))
 | 
			
		||||
<p>${text}<br><a href="https://${quoteLink}">${quoteLink}</a></p>
 | 
			
		||||
| 
						 | 
				
			
			@ -58,7 +58,7 @@
 | 
			
		|||
#end proc
 | 
			
		||||
#
 | 
			
		||||
#proc renderTimelineRss*(timeline: Timeline; profile: Profile): string =
 | 
			
		||||
#let prefs = Prefs(replaceTwitter: hostname)
 | 
			
		||||
#let prefs = Prefs(replaceTwitter: hostname, replaceYoutube: "invidio.us")
 | 
			
		||||
#result = ""
 | 
			
		||||
<?xml version="1.0" encoding="UTF-8"?>
 | 
			
		||||
<rss xmlns:atom="http://www.w3.org/2005/Atom" xmlns:dc="http://purl.org/dc/elements/1.1/" version="2.0">
 | 
			
		||||
| 
						 | 
				
			
			@ -84,7 +84,7 @@
 | 
			
		|||
#end proc
 | 
			
		||||
#
 | 
			
		||||
#proc renderListRss*(tweets: seq[Tweet]; name, list: string): string =
 | 
			
		||||
#let prefs = Prefs(replaceTwitter: hostname)
 | 
			
		||||
#let prefs = Prefs(replaceTwitter: hostname, replaceYoutube: "invidio.us")
 | 
			
		||||
#let link = &"https://{hostname}/{name}/lists/{list}"
 | 
			
		||||
#result = ""
 | 
			
		||||
<?xml version="1.0" encoding="UTF-8"?>
 | 
			
		||||
| 
						 | 
				
			
			@ -102,7 +102,7 @@
 | 
			
		|||
#end proc
 | 
			
		||||
#
 | 
			
		||||
#proc renderSearchRss*(tweets: seq[Tweet]; name, param: string): string =
 | 
			
		||||
#let prefs = Prefs(replaceTwitter: hostname)
 | 
			
		||||
#let prefs = Prefs(replaceTwitter: hostname, replaceYoutube: "invidio.us")
 | 
			
		||||
#let link = &"https://{hostname}/search"
 | 
			
		||||
#result = ""
 | 
			
		||||
<?xml version="1.0" encoding="UTF-8"?>
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -56,7 +56,7 @@ proc renderUser(user: Profile; prefs: Prefs): VNode =
 | 
			
		|||
        linkUser(user, class="username")
 | 
			
		||||
 | 
			
		||||
      tdiv(class="tweet-content media-body"):
 | 
			
		||||
        verbatim linkifyText(user.bio, prefs)
 | 
			
		||||
        verbatim replaceUrl(user.bio, prefs)
 | 
			
		||||
 | 
			
		||||
proc renderTimelineUsers*(results: Result[Profile]; prefs: Prefs; path=""): VNode =
 | 
			
		||||
  buildHtml(tdiv(class="timeline")):
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -215,7 +215,7 @@ proc renderQuote(quote: Quote; prefs: Prefs): VNode =
 | 
			
		|||
      renderReply(quote)
 | 
			
		||||
 | 
			
		||||
    tdiv(class="quote-text"):
 | 
			
		||||
      verbatim linkifyText(quote.text, prefs)
 | 
			
		||||
      verbatim replaceUrl(quote.text, prefs)
 | 
			
		||||
 | 
			
		||||
    if quote.hasThread:
 | 
			
		||||
      a(class="show-thread", href=getLink(quote)):
 | 
			
		||||
| 
						 | 
				
			
			@ -248,7 +248,7 @@ proc renderTweet*(tweet: Tweet; prefs: Prefs; path: string; class="";
 | 
			
		|||
        renderReply(tweet)
 | 
			
		||||
 | 
			
		||||
      tdiv(class="tweet-content media-body"):
 | 
			
		||||
        verbatim linkifyText(tweet.text, prefs)
 | 
			
		||||
        verbatim replaceUrl(tweet.text, prefs)
 | 
			
		||||
 | 
			
		||||
      if tweet.quote.isSome:
 | 
			
		||||
        renderQuote(tweet.quote.get(), prefs)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -51,7 +51,7 @@ link = [
 | 
			
		|||
        'old.reddit.com/r/programming…'
 | 
			
		||||
    ]],
 | 
			
		||||
    ['nim_lang/status/1125887775151140864', [
 | 
			
		||||
        'en.wikipedia.org/wiki/Nim_(p…)'
 | 
			
		||||
        'en.wikipedia.org/wiki/Nim_(p…'
 | 
			
		||||
    ]],
 | 
			
		||||
    ['hiankun_taioan/status/1086916335215341570', [
 | 
			
		||||
        '(hackernoon.com/interview-wit…)'
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue