urlhaus-filter/src/clean_url.js

147 lines
4.3 KiB
JavaScript

'use strict'
import { createInterface } from 'node:readline'
/**
 * Tidy up the leading part of a host or URL string.
 *
 * Two fixes are applied, in order:
 *  1. a garbled second protocol fragment (`ttps://`, `https:/` or `http/`)
 *     that directly follows a leading `http://` / `https://` is dropped (see #32);
 *  2. a leading `www.` label is dropped, keeping the protocol when one is present.
 *
 * @param {string} hostname - Host or URL string, with or without a protocol.
 * @returns {string} The cleaned string.
 */
const cleanHost = (hostname) => {
  const withoutBrokenProtocol = hostname.replace(/^(https?:\/\/)(?:ttps:\/\/|https:\/|http\/)/, '$1')
  return withoutBrokenProtocol.replace(/^(https?:\/\/)?www\./, '$1')
}
/**
 * Percent-encode `^` in the path portion of a string.
 *
 * nodejs does not percent-encode ^ yet
 * https://github.com/nodejs/node/issues/57313
 *
 * Only the text before the first `?` is rewritten; everything after it
 * (the query string) is returned unchanged.
 *
 * @param {string} pathname - Path, optionally followed by `?query`.
 * @returns {string} The input with every `^` in the path replaced by `%5E`.
 */
const caretPath = (pathname) => {
  const queryStart = pathname.indexOf('?')
  // No query string: the whole input is path
  if (queryStart === -1) return pathname.replaceAll('^', '%5E')

  const encodedPath = pathname.slice(0, queryStart).replaceAll('^', '%5E')
  const query = pathname.slice(queryStart + 1)
  return `${encodedPath}?${query}`
}
// Regex source fragments matching hostnames of link-protection services.
// When the hostname left after unwrapping still matches one of these,
// deSafelink() runs again on the result.
const safeLinks = [
  'safelinks\\.protection\\.outlook\\.com',
  '\\.protection\\.sophos\\.com',
  'linkprotect\\.cudasvc\\.com'
]

/**
 * Replace a known redirect / link-protection URL with the URL it points to.
 *
 * The checks run sequentially on the current URL, so the result of one
 * unwrapping step can be unwrapped again by a later check in the same pass.
 *
 * @param {string} urlStr - Absolute URL.
 * @returns {string} `href` of the unwrapped URL, or of the input URL when no
 *   rule applies.
 * @throws {TypeError} If `urlStr`, or a wrapped target passed to `new URL()`,
 *   is not a valid absolute URL (including a missing query parameter in the
 *   branches that do not check for it).
 * @throws {URIError} If an SES target contains a malformed percent-escape.
 */
const deSafelink = (urlStr) => {
  let url = new URL(urlStr)

  // O365 Safelinks
  if (url.hostname.endsWith('safelinks.protection.outlook.com')) {
    url = new URL(url.searchParams.get('url'))
  }

  // Sophos
  if (url.hostname.endsWith('.protection.sophos.com')) {
    // The `d` parameter has no protocol, so one is prepended
    url = new URL(`http://${url.searchParams.get('d')}`)
  }

  // Barracuda
  if (url.hostname.endsWith('linkprotect.cudasvc.com')) {
    url = new URL(url.searchParams.get('a'))
  }

  // ShopMy & Disqus
  if ((url.hostname === 'api.shopmy.us' && url.pathname === '/api/redirect_click') || url.hostname === 'disq.us') {
    url = new URL(url.searchParams.get('url'))
  }

  // VKontakte
  if ((url.hostname === 'vk.com' || url.hostname === 'vkontakte.ru') && url.pathname === '/away.php') {
    url = new URL(url.searchParams.get('to'))
  }

  // WhatsApp
  if (url.hostname === 'l.wl.co' && url.pathname === '/l') {
    url = new URL(url.searchParams.get('u'))
  }

  // Google Ads
  if (url.hostname.endsWith('doubleclick.net') || url.hostname.endsWith('googleadservices.com')) {
    // The last occurrence of a repeated parameter is used
    let paramUrl = url.searchParams.getAll('adurl').at(-1) || url.searchParams.getAll('url').at(-1) || url.searchParams.getAll('ds_dest_url').at(-1)
    if (paramUrl) {
      // Protocol-relative target
      paramUrl = paramUrl.replace(/^\/\//, 'https://')
      url = new URL(paramUrl)
    }
  }

  // Google Search
  // Google AMP does not redirect (e.g. google.com/amp/example.com)
  if (url.hostname.endsWith('google.com') && (url.pathname.startsWith('/url') || url.pathname.startsWith('/travel/clk'))) {
    const paramUrl = url.searchParams.get('q') || url.searchParams.get('url') || url.searchParams.get('pcurl')
    if (paramUrl) url = new URL(paramUrl)
  }

  // SES
  // https://github.com/uBlockOrigin/uAssets/blob/42e518277ab0c36d4b131aa01b4a8828af4e18b6/filters/privacy.txt#L866
  // Both conditions must be separate operands: passing the `&&` expression to
  // endsWith() compared the hostname against "true"/"false" and never matched.
  if (url.hostname.endsWith('awstrack.me') && url.pathname.startsWith('/L0')) {
    const wrapped = url.pathname.match(/\/L0\/(http[^\/?#]+)/)
    // Keep the URL as-is when the path does not embed an encoded target
    if (wrapped) url = new URL(decodeURIComponent(wrapped[1]))
  }

  // DuckDuckGo
  if (url.hostname === 'duckduckgo.com' && url.pathname === '/l/') {
    url = new URL(url.searchParams.get('uddg'))
  }

  // The unwrapped URL is itself behind a link-protection service: unwrap again
  if (url.hostname.match(new RegExp(safeLinks.join('|')))) {
    return deSafelink(url.href)
  }

  return url.href
}
// Read lines from stdin and print one cleaned entry per line to stdout.
// With `hostname` as the first CLI argument only the hostname is printed;
// otherwise the cleaned URL without its protocol is printed.
// The mode cannot change during a run, so the argument is read once.
const isHostnameMode = process.argv[2] === 'hostname'

for await (const line of createInterface({ input: process.stdin, terminal: false })) {
  // parse hostname from url
  if (isHostnameMode) {
    if (URL.canParse(`http://${line}`)) {
      const url = new URL(`http://${line}`)
      console.log(url.hostname)
    } else {
      // Fallback for input the URL parser rejects: cut the hostname out by hand
      const hostname = line
        // host
        .split('/')[0]
        // exclude credential
        .replace(/.*@(.+)/, '$1')
        // exclude port
        .replace(/:\d+$/, '')
        // #2
        .split('?')[0]
      console.log(hostname)
    }
  } else {
    // Third `/`-separated segment, i.e. the host part of `protocol://host/...`
    const authority = line.split('/')[2]
    // Lines with fewer than two slashes (e.g. blank lines) have no such
    // segment; skip them rather than crash on `undefined.includes()`.
    // Skip invalid domains, see #15
    if (authority === undefined || authority.includes('??')) continue

    if (URL.canParse(line)) {
      const url = new URL(deSafelink(cleanHost(line)))
      url.host = cleanHost(url.host)
      // nodejs does not percent-encode ^ yet
      // https://github.com/nodejs/node/issues/57313
      url.pathname = caretPath(url.pathname)
      const outUrl = `${url.host}${url.pathname}${url.search}`
        // remove trailing slash from domain except path
        .replace(/(^[^/]*)\/+$/, '$1')
      console.log(outUrl)
    } else {
      const outUrl = caretPath(cleanHost(line
        // remove protocol
        .split('/').slice(2).join('/')))
        // url encode space
        .replaceAll(' ', '%20')
        // remove trailing slash from domain except path
        .replace(/(^[^/]*)\/+$/, '$1')
      console.log(outUrl)
    }
  }
}