新增构建OpenSSL镜像相关文件

This commit is contained in:
2024-03-15 14:52:38 +08:00
committed by huty
parent 43337c1a0b
commit 132c17af2d
10119 changed files with 1581963 additions and 0 deletions

View File

@@ -0,0 +1,189 @@
import addressparser from 'addressparser'
import axios from 'axios'
import { parseHTML } from 'linkedom'
import { Browser } from 'puppeteer-core'
import { v4 as uuid } from 'uuid'
// Parsed targets from a List-Unsubscribe header: a mailto address and/or an HTTP URL.
interface Unsubscribe {
  mailTo?: string
  httpUrl?: string
}

// Raw inbound newsletter email fields as received from the mail pipeline.
export interface NewsletterInput {
  from: string
  to: string
  subject: string
  html: string
  headers: Record<string, string | string[]>
}

// Normalized result produced by ContentHandler.handleNewsletter.
export interface NewsletterResult {
  email: string
  content: string
  url: string
  title: string
  author: string
  unsubMailTo?: string
  unsubHttpUrl?: string
}

// Partial article data returned by a handler's preHandle step; any field may be absent.
export interface PreHandleResult {
  url?: string
  title?: string
  content?: string
  contentType?: string
  dom?: Document
}

// Placeholder base used when a newsletter has no resolvable canonical URL.
export const FAKE_URL_PREFIX = 'https://omnivore.app/no_url?q='
// Generates a unique fake URL so stored items without a real URL do not collide.
export const generateUniqueUrl = () => FAKE_URL_PREFIX + uuid()
/**
 * Base class for site- and newsletter-specific content handlers.
 *
 * Subclasses opt into pipeline stages by overriding the predicate methods
 * and their paired actions:
 *   - shouldResolve / resolve:       rewrite a URL before anything else runs
 *   - shouldPreHandle / preHandle:   take over fetching for a URL
 *   - shouldPreParse / preParse:     clean a parsed DOM before article parsing
 *   - isNewsletter / handleNewsletter: recognize and normalize newsletter email
 */
export abstract class ContentHandler {
  // Matches the From: address of this handler's newsletter.
  // NOTE(review): the default /NEWSLETTER_SENDER_REGEX/ is a literal placeholder
  // (it only matches that exact text); subclasses are expected to override it.
  protected senderRegex: RegExp
  // Extracts the canonical article URL from the email HTML (capture group 1).
  protected urlRegex: RegExp
  // Human-readable handler name; also reused for host matching in some subclasses.
  name: string

  protected constructor() {
    this.senderRegex = new RegExp(/NEWSLETTER_SENDER_REGEX/)
    this.urlRegex = new RegExp(/NEWSLETTER_URL_REGEX/)
    this.name = 'Handler name'
  }

  // Whether this handler wants to rewrite the URL before fetching (default: no).
  shouldResolve(url: string): boolean {
    return false
  }

  // Rewrites the URL; the default implementation returns it unchanged.
  async resolve(url: string): Promise<string | undefined> {
    return Promise.resolve(url)
  }

  // Whether this handler wants to take over fetching for the URL (default: no).
  shouldPreHandle(url: string): boolean {
    return false
  }

  // Produces pre-fetched content for the URL; the default returns only the URL.
  async preHandle(url: string, browser?: Browser): Promise<PreHandleResult> {
    return Promise.resolve({ url })
  }

  // Whether this handler wants to clean the DOM before parsing (default: no).
  shouldPreParse(url: string, dom: Document): boolean {
    return false
  }

  // Cleans up the DOM; the default implementation is a no-op.
  async preParse(url: string, dom: Document): Promise<Document> {
    return Promise.resolve(dom)
  }

  // Default newsletter detection: the sender matches senderRegex AND the
  // message carries a List-Post or List-Unsubscribe header.
  async isNewsletter(input: {
    from: string
    html: string
    headers: Record<string, string | string[]>
    dom: Document
  }): Promise<boolean> {
    const re = new RegExp(this.senderRegex)
    const postHeader = input.headers['list-post']
    const unSubHeader = input.headers['list-unsubscribe']
    return Promise.resolve(
      re.test(input.from) && (!!postHeader || !!unSubHeader)
    )
  }

  // Subclasses return the "view in browser"-style link found in the email DOM.
  findNewsletterHeaderHref(dom: Document): string | undefined {
    return undefined
  }

  // Given an HTML blob tries to find a URL to use for
  // a canonical URL.
  async findNewsletterUrl(html: string): Promise<string | undefined> {
    const dom = parseHTML(html).document
    // Check if this is a substack newsletter
    const href = this.findNewsletterHeaderHref(dom)
    if (href) {
      // Try to make a HEAD request, so we get the redirected URL, since these
      // will usually be behind tracking url redirects
      try {
        const response = await axios.head(href, { timeout: 5000 })
        return Promise.resolve(
          // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access
          response.request.res.responseUrl as string | undefined
        )
      } catch (e) {
        // On a failed HEAD request, fall back to the unresolved tracking link.
        console.log('error making HEAD request', e)
        return Promise.resolve(href)
      }
    }
    return Promise.resolve(undefined)
  }

  // Resolves the canonical URL: first from the DOM link, then via urlRegex.
  async parseNewsletterUrl(
    headers: Record<string, string | string[]>,
    html: string
  ): Promise<string | undefined> {
    // get url from dom
    const url = await this.findNewsletterUrl(html)
    if (url) {
      return url
    }
    // get newsletter url from html
    const matches = html.match(this.urlRegex)
    if (matches) {
      return matches[1]
    }
    return undefined
  }

  // Extracts a display name from a From: value; falls back to the raw string
  // when no name part is present.
  parseAuthor(from: string): string {
    // get author name from email
    // e.g. 'Jackson Harper from Omnivore App <jacksonh@substack.com>'
    // or 'Mike Allen <mike@axios.com>'
    const parsed = addressparser(from)
    if (parsed.length > 0 && parsed[0].name) {
      return parsed[0].name
    }
    return from
  }

  // Extracts mailto/http unsubscribe targets from a List-Unsubscribe header.
  parseUnsubscribe(unSubHeader: string): Unsubscribe {
    // parse list-unsubscribe header
    // e.g. List-Unsubscribe: <https://omnivore.com/unsub>, <mailto:unsub@omnivore.com>
    return {
      httpUrl: unSubHeader.match(/<(https?:\/\/[^>]*)>/)?.[1],
      mailTo: unSubHeader.match(/<mailto:([^>]*)>/)?.[1],
    }
  }

  // Normalizes a newsletter email into a NewsletterResult.
  // Throws when any of the required fields (from/to/subject/html) is missing.
  async handleNewsletter({
    from,
    to,
    subject,
    html,
    headers,
  }: NewsletterInput): Promise<NewsletterResult> {
    console.log('handleNewsletter', from, to, subject, headers)
    if (!from || !html || !subject || !to) {
      console.log('invalid newsletter email')
      throw new Error('invalid newsletter email')
    }
    // fallback to default url if newsletter url does not exist
    // assign a random uuid to the default url to avoid duplicate url
    const url =
      (await this.parseNewsletterUrl(headers, html)) || generateUniqueUrl()
    const author = this.parseAuthor(from)
    const unsubscribe = headers['list-unsubscribe']
      ? this.parseUnsubscribe(headers['list-unsubscribe'].toString())
      : undefined
    return {
      email: to,
      content: html,
      url,
      title: subject,
      author,
      unsubMailTo: unsubscribe?.mailTo || '',
      unsubHttpUrl: unsubscribe?.httpUrl || '',
    }
  }
}

View File

@@ -0,0 +1,186 @@
import { parseHTML } from 'linkedom'
import { Browser } from 'puppeteer-core'
import {
ContentHandler,
NewsletterInput,
NewsletterResult,
PreHandleResult,
} from './content-handler'
import { AxiosHandler } from './newsletters/axios-handler'
import { BeehiivHandler } from './newsletters/beehiiv-handler'
import { BloombergNewsletterHandler } from './newsletters/bloomberg-newsletter-handler'
import { ConvertkitHandler } from './newsletters/convertkit-handler'
import { CooperPressHandler } from './newsletters/cooper-press-handler'
import { EnergyWorldHandler } from './newsletters/energy-world'
import { EveryIoHandler } from './newsletters/every-io-handler'
import { GenericHandler } from './newsletters/generic-handler'
import { GhostHandler } from './newsletters/ghost-handler'
import { GolangHandler } from './newsletters/golang-handler'
import { HeyWorldHandler } from './newsletters/hey-world-handler'
import { IndiaTimesHandler } from './newsletters/india-times-handler'
import { MorningBrewHandler } from './newsletters/morning-brew-handler'
import { RevueHandler } from './newsletters/revue-handler'
import { SubstackHandler } from './newsletters/substack-handler'
import { AppleNewsHandler } from './websites/apple-news-handler'
import { ArsTechnicaHandler } from './websites/ars-technica-handler'
import { BloombergHandler } from './websites/bloomberg-handler'
import { DerstandardHandler } from './websites/derstandard-handler'
import { GitHubHandler } from './websites/github-handler'
import { ImageHandler } from './websites/image-handler'
import { MediumHandler } from './websites/medium-handler'
import { NitterHandler } from './websites/nitter-handler'
import { PdfHandler } from './websites/pdf-handler'
import { PipedVideoHandler } from './websites/piped-video-handler'
import { ScrapingBeeHandler } from './websites/scrapingBee-handler'
import { StackOverflowHandler } from './websites/stack-overflow-handler'
import { TDotCoHandler } from './websites/t-dot-co-handler'
import { TheAtlanticHandler } from './websites/the-atlantic-handler'
import { WeixinQqHandler } from './websites/weixin-qq-handler'
import { WikipediaHandler } from './websites/wikipedia-handler'
import { YoutubeHandler } from './websites/youtube-handler'
import { ZhihuHandler } from './websites/zhihu-handler'
/**
 * Validates that a URL is safe to fetch server-side (basic SSRF guard).
 * Returns true when the URL is acceptable; throws otherwise.
 *
 * @throws Error when the protocol is not http(s), the host is local, or the
 *         host is a private (RFC 1918) IP address.
 */
const validateUrlString = (url: string): boolean => {
  const u = new URL(url)
  // Make sure the URL is http or https
  if (u.protocol !== 'http:' && u.protocol !== 'https:') {
    throw new Error('Invalid URL protocol check failed')
  }
  // Make sure the domain is not localhost, the any-interface address, or the
  // 127.0.0.0/8 loopback range (127.x was previously not rejected).
  if (
    u.hostname === 'localhost' ||
    u.hostname === '0.0.0.0' ||
    /^127\./.test(u.hostname)
  ) {
    throw new Error('Invalid URL is localhost')
  }
  // Make sure the domain is not a private IP.
  // RFC 1918 defines 10.0.0.0/8, 172.16.0.0/12 (172.16–172.31) and
  // 192.168.0.0/16; the previous pattern only covered 172.16.x.x.
  if (/^(10|172\.(1[6-9]|2\d|3[01])|192\.168)\./.test(u.hostname)) {
    throw new Error('Invalid URL is private ip')
  }
  return true
}
// Website content handlers, consulted in order by preHandleContent and
// preParseContent; the first handler whose predicate returns true wins,
// so the order of this list is significant.
const contentHandlers: ContentHandler[] = [
  new ArsTechnicaHandler(),
  new TheAtlanticHandler(),
  new AppleNewsHandler(),
  new BloombergHandler(),
  new DerstandardHandler(),
  new ImageHandler(),
  new MediumHandler(),
  new PdfHandler(),
  new ScrapingBeeHandler(),
  new TDotCoHandler(),
  new YoutubeHandler(),
  new WikipediaHandler(),
  new GitHubHandler(),
  new AxiosHandler(),
  new GolangHandler(),
  new MorningBrewHandler(),
  new BloombergNewsletterHandler(),
  new SubstackHandler(),
  new StackOverflowHandler(),
  new EnergyWorldHandler(),
  new PipedVideoHandler(),
  new WeixinQqHandler(),
  new NitterHandler(),
  new ZhihuHandler(),
]
// Newsletter handlers are tried in order; the first one whose isNewsletter()
// returns true wins. GenericHandler matches ANY email carrying standard list
// headers, so it must come AFTER every specific handler — previously it sat
// before EveryIoHandler, EnergyWorldHandler and IndiaTimesHandler and could
// shadow them.
const newsletterHandlers: ContentHandler[] = [
  new AxiosHandler(),
  new BloombergNewsletterHandler(),
  new GolangHandler(),
  new SubstackHandler(),
  new MorningBrewHandler(),
  new BeehiivHandler(),
  new ConvertkitHandler(),
  new RevueHandler(),
  new GhostHandler(),
  new CooperPressHandler(),
  new HeyWorldHandler(),
  new EveryIoHandler(),
  new EnergyWorldHandler(),
  new IndiaTimesHandler(),
  // catch-all fallback — keep last
  new GenericHandler(),
]
/**
 * Runs registered content handlers against a URL before the page is fetched.
 * First gives handlers a chance to rewrite (resolve) the URL, then lets the
 * first matching handler produce a PreHandleResult.
 */
export const preHandleContent = async (
  url: string,
  browser: Browser
): Promise<PreHandleResult | undefined> => {
  // Step 1: optionally pre-resolve the URL via the first handler that opts in.
  // TODO: This should probably happen recursively, so URLs can be
  // pre-resolved, handled, pre-resolved, handled, etc.
  const resolver = contentHandlers.find((h) => h.shouldResolve(url))
  if (resolver) {
    try {
      const resolved = await resolver.resolve(url)
      if (resolved && validateUrlString(resolved)) {
        url = resolved
      }
    } catch (err) {
      console.log('error resolving url with handler', resolver.name, err)
    }
  }
  // Step 2: before fetching the page, let the first handler that claims the
  // URL perform a prefetch action that can replace the request entirely.
  const claimant = contentHandlers.find((h) => h.shouldPreHandle(url))
  if (claimant) {
    console.log('preHandleContent', claimant.name, url)
    return claimant.preHandle(url, browser)
  }
  return undefined
}
/**
 * Gives the first handler that claims this document a chance to clean it up
 * before the main article parse runs. Returns the cleaned Document, or
 * undefined when no handler claimed it.
 */
export const preParseContent = async (
  url: string,
  dom: Document
): Promise<Document | undefined> => {
  const claimant = contentHandlers.find((h) => h.shouldPreParse(url, dom))
  if (!claimant) {
    return undefined
  }
  console.log('preParseContent', claimant.name, url)
  return claimant.preParse(url, dom)
}
/**
 * Finds the first newsletter handler that recognizes the given email.
 * The HTML is parsed once and the resulting DOM is shared by every candidate.
 */
export const getNewsletterHandler = async (input: {
  from: string
  html: string
  headers: Record<string, string | string[]>
}): Promise<ContentHandler | undefined> => {
  const { document } = parseHTML(input.html)
  for (const candidate of newsletterHandlers) {
    const recognized = await candidate.isNewsletter({ ...input, dom: document })
    if (recognized) {
      return candidate
    }
  }
  return undefined
}
/**
 * Normalizes a newsletter email via the first handler that recognizes it.
 * Returns undefined when no handler claims the message.
 */
export const handleNewsletter = async (
  input: NewsletterInput
): Promise<NewsletterResult | undefined> => {
  const handler = await getNewsletterHandler(input)
  if (!handler) {
    return undefined
  }
  console.log('handleNewsletter', handler.name, input.subject)
  return handler.handleNewsletter(input)
}
// NOTE(review): this file already uses ES module `export` statements above;
// this extra CommonJS-style assignment is redundant under most TS configs and
// can fail type-checking in pure ESM — confirm a CommonJS consumer still
// requires it before keeping it.
module.exports = {
  preHandleContent,
  handleNewsletter,
  preParseContent,
  getNewsletterHandler,
}

View File

@@ -0,0 +1,46 @@
import { ContentHandler } from '../content-handler'
/** Handler for the Axios newsletter: recognizes the sender and strips the
 * table-based layout (header rows, ads, footer) during preParse. */
export class AxiosHandler extends ContentHandler {
  constructor() {
    super()
    // Escaped the '.' so the pattern cannot match look-alikes like "@axiosXcom".
    this.senderRegex = /<.+@axios\.com>/
    this.urlRegex = /View in browser at <a.*>(.*)<\/a>/
    this.name = 'axios'
  }

  shouldPreParse(url: string, dom: Document): boolean {
    const host = this.name + '.com'
    // check if url ends with axios.com
    return new URL(url).hostname.endsWith(host)
  }

  async preParse(url: string, dom: Document): Promise<Document> {
    const body = dom.querySelector('table')
    let isFooter = false
    // this removes ads and replaces table with a div
    body?.querySelectorAll('table').forEach((el) => {
      // remove the footer and the ads
      if (!el.textContent || el.textContent.length < 20 || isFooter) {
        el.remove()
      } else {
        // removes the first few rows of the table (the header)
        // remove the last two rows of the table (they are ads)
        // NOTE(review): the length is deliberately re-queried inside the loop —
        // it shrinks as rows are removed, so more than two trailing rows can be
        // dropped; confirm this matches the intended layout before "fixing" it.
        el.querySelectorAll('tr').forEach((tr, i) => {
          if (i <= 7 || i >= el.querySelectorAll('tr').length - 2) {
            // (removed a leftover debug console.log that printed each DOM node)
            tr.remove()
          }
        })
        // replace the table with a div
        const div = dom.createElement('div')
        div.innerHTML = el.innerHTML
        el.parentNode?.replaceChild(div, el)
        // set the isFooter flag to true because the next table is the footer
        isFooter = true
      }
    })
    return Promise.resolve(dom)
  }
}

View File

@@ -0,0 +1,24 @@
import { ContentHandler } from '../content-handler'
/** Handler for Beehiiv newsletters, detected via Beehiiv-specific headers. */
export class BeehiivHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'beehiiv'
  }

  // Beehiiv marks its newsletter sends with a dedicated x-beehiiv-type header.
  async isNewsletter(input: {
    from: string
    headers: Record<string, string | string[]>
  }): Promise<boolean> {
    const mailType = input.headers['x-beehiiv-type']?.toString()
    return mailType === 'newsletter'
  }

  // The canonical web URL of the post is carried in the x-newsletter header.
  async parseNewsletterUrl(
    headers: Record<string, string | string[]>,
    html: string
  ): Promise<string | undefined> {
    return headers['x-newsletter']?.toString()
  }
}

View File

@@ -0,0 +1,37 @@
import { ContentHandler } from '../content-handler'
/** Handler for Bloomberg email newsletters: recognizes the sender/logo and
 * strips the Sailthru header and footer chrome during preParse. */
export class BloombergNewsletterHandler extends ContentHandler {
  constructor() {
    super()
    // Escaped the literal dots so the pattern cannot match look-alike domains
    // (e.g. "mailXbloomberg…"); the '.*' segment is intentionally kept as a
    // wildcard for Bloomberg sub-brand domains.
    this.senderRegex = /<.+@mail\.bloomberg.*\.com>/
    this.urlRegex = /<a class="view-in-browser__url" href=["']([^"']*)["']/
    this.name = 'bloomberg'
  }

  shouldPreParse(url: string, dom: Document): boolean {
    const host = this.name + '.com'
    // check if url ends with bloomberg.com, or the email carries the logo
    return (
      new URL(url).hostname.endsWith(host) ||
      dom.querySelector('.logo-image')?.getAttribute('alt')?.toLowerCase() ===
        this.name
    )
  }

  async preParse(url: string, dom: Document): Promise<Document> {
    const body = dom.querySelector('.wrapper')
    // this removes header
    body?.querySelector('.sailthru-variables')?.remove()
    body?.querySelector('.preview-text')?.remove()
    body?.querySelector('.logo-wrapper')?.remove()
    body?.querySelector('.by-the-number-wrapper')?.remove()
    // this removes footer
    body?.querySelector('.quote-box-wrapper')?.remove()
    body?.querySelector('.header-wrapper')?.remove()
    body?.querySelector('.component-wrapper')?.remove()
    body?.querySelector('.footer')?.remove()
    return Promise.resolve(dom)
  }
}

View File

@@ -0,0 +1,52 @@
import { ContentHandler } from '../content-handler'
/** Handler for ConvertKit newsletters, detected via ConvertKit-hosted images. */
export class ConvertkitHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'convertkit'
  }

  // Looks for the "view in browser" style anchor ConvertKit embeds in the
  // body; when several anchors match, the last one wins (original scan order).
  findNewsletterHeaderHref(dom: Document): string | undefined {
    const labels = ['View this email in your browser', 'Read on FS']
    let href: string | undefined = undefined
    dom.querySelectorAll('a').forEach((anchor) => {
      if (anchor.textContent !== null && labels.includes(anchor.textContent)) {
        href = anchor.getAttribute('href') || undefined
      }
    })
    return href
  }

  // Detected by ConvertKit-hosted images, excluding subscription-confirmation
  // emails.
  async isNewsletter(input: {
    from: string
    dom: Document
    headers: Record<string, string | string[]>
  }): Promise<boolean> {
    const { dom } = input
    const icons = dom.querySelectorAll(
      'img[src*="convertkit.com"], img[src*="convertkit-mail"]'
    )
    if (icons.length === 0) {
      return false
    }
    // ignore newsletters that have a confirmation link to the newsletter in the body
    const links = dom.querySelectorAll(
      'a[href*="convertkit.com"], a[href*="convertkit-mail"]'
    )
    const hasConfirmation = Array.from(links).some(
      (anchor) => anchor.textContent === 'Confirm your subscription'
    )
    return !hasConfirmation
  }

  async parseNewsletterUrl(
    headers: Record<string, string | string[]>,
    html: string
  ): Promise<string | undefined> {
    return this.findNewsletterUrl(html)
  }
}

View File

@@ -0,0 +1,37 @@
import { ContentHandler } from '../content-handler'
/** Handler for Cooper Press newsletters (JavaScript Weekly, Golang Weekly
 * siblings, etc.), detected via cooperpress.com links. */
export class CooperPressHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'cooper-press'
  }

  // The "Read on the Web" anchor carries the canonical post URL; the last
  // matching anchor wins (original scan order).
  findNewsletterHeaderHref(dom: Document): string | undefined {
    let href: string | undefined = undefined
    dom.querySelectorAll('a').forEach((anchor) => {
      if (anchor.textContent === 'Read on the Web') {
        href = anchor.getAttribute('href') || undefined
      }
    })
    return href
  }

  // Detected by any link pointing at cooperpress.com.
  async isNewsletter(input: {
    from: string
    dom: Document
    headers: Record<string, string | string[]>
  }): Promise<boolean> {
    return input.dom.querySelectorAll('a[href*="cooperpress.com"]').length > 0
  }

  async parseNewsletterUrl(
    headers: Record<string, string | string[]>,
    html: string
  ): Promise<string | undefined> {
    return this.findNewsletterUrl(html)
  }
}

View File

@@ -0,0 +1,44 @@
import { ContentHandler } from '../content-handler'
/** Handler for the ETEnergyworld newsletter: recognized by its exact sender
 * string and flattened from a table layout into paragraphs during preParse. */
export class EnergyWorldHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Energy World'
  }

  // Matched purely by the exact sender string of the ETEnergyworld newsletter.
  async isNewsletter(input: {
    from: string
    html: string
    headers: Record<string, string | string[]>
    dom: Document
  }): Promise<boolean> {
    return Promise.resolve(
      input.from === 'ETEnergyworld Latest News<newsletter@etenergyworld.com>'
    )
  }

  // Detects the newsletter layout by its logo image filename.
  shouldPreParse(url: string, dom: Document): boolean {
    return dom.querySelectorAll('img[src*="etenergyworld.png"]').length > 0
  }

  // Flattens the table layout: each row of the "multi-cols" table becomes a
  // <p>, and the document body is replaced with those paragraphs.
  async preParse(url: string, dom: Document): Promise<Document> {
    // get the main content
    const main = dom.querySelector('table[class="nletter-wrap"]')
    if (!main) {
      // layout not recognized — leave the document untouched
      return Promise.resolve(dom)
    }
    // create a new dom
    const newDom = dom.createDocumentFragment()
    // add the content to the new dom
    main.querySelectorAll('table[class="multi-cols"] tr').forEach((tr) => {
      const p = dom.createElement('p')
      p.innerHTML = tr.innerHTML
      newDom.appendChild(p)
    })
    dom.body.replaceChildren(newDom)
    return Promise.resolve(dom)
  }
}

View File

@@ -0,0 +1,22 @@
import { ContentHandler } from '../content-handler'
/** Handler for the Every newsletter, matched by its exact sender address. */
export class EveryIoHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Every.io'
  }

  // Every newsletters always come from this exact From: value.
  async isNewsletter(input: {
    from: string
    html: string
    headers: Record<string, string | string[]>
    dom: Document
  }): Promise<boolean> {
    return input.from === 'Every <hello@every.to>'
  }

  // The title anchor inside the newsletter header carries the post URL.
  findNewsletterHeaderHref(dom: Document): string | undefined {
    const anchor = dom.querySelector('.newsletter-email .title a')
    return anchor?.getAttribute('href') || undefined
  }
}

View File

@@ -0,0 +1,49 @@
import { ContentHandler } from '../content-handler'
import addressparser from 'addressparser'
/** Catch-all newsletter handler: accepts any email carrying standard mailing
 * list headers and looks for common "view in browser" link phrasings. */
export class GenericHandler extends ContentHandler {
  // newsletter url text regex for newsletters that don't have a newsletter header
  NEWSLETTER_URL_TEXT_REGEX =
    /((View|Read)(.*)(email|post)?(.*)(in your browser|online|on (FS|the Web))|Lire en ligne)/i

  constructor() {
    super()
    this.name = 'Generic Newsletter'
  }

  // Treat any email with list-post/list-id or list-unsubscribe headers as a
  // newsletter.
  async isNewsletter(input: {
    from: string
    html: string
    headers: Record<string, string | string[]>
    dom: Document
  }): Promise<boolean> {
    const hasPostHeader = !!(
      input.headers['list-post'] || input.headers['list-id']
    )
    const hasUnsubHeader = !!input.headers['list-unsubscribe']
    return hasPostHeader || hasUnsubHeader
  }

  // Finds an anchor whose text matches a "view online" phrasing; the last
  // matching anchor wins (original scan order).
  findNewsletterHeaderHref(dom: Document): string | undefined {
    let href: string | undefined = undefined
    dom.querySelectorAll('a').forEach((anchor) => {
      const text = anchor.textContent
      if (text && this.NEWSLETTER_URL_TEXT_REGEX.test(text)) {
        href = anchor.getAttribute('href') || undefined
      }
    })
    return href
  }

  async parseNewsletterUrl(
    headers: Record<string, string | string[]>,
    html: string
  ): Promise<string | undefined> {
    // raw SubStack newsletter url is like <https://hongbo130.substack.com/p/tldr>
    // we need to get the real url from the raw url
    const postHeader = headers['list-post']?.toString()
    if (postHeader) {
      const parsed = addressparser(postHeader)
      if (parsed.length > 0) {
        return parsed[0].name
      }
    }
    return this.findNewsletterUrl(html)
  }
}

View File

@@ -0,0 +1,31 @@
import { ContentHandler } from '../content-handler'
/** Handler for Ghost-hosted newsletters, detected via ghost.org images. */
export class GhostHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'ghost'
  }

  // Ghost emails expose an explicit "view online" anchor.
  findNewsletterHeaderHref(dom: Document): string | undefined {
    const link = dom.querySelector('.view-online-link')
    return link?.getAttribute('href') || undefined
  }

  // Detected by any image served from ghost.org in the email body.
  async isNewsletter(input: {
    from: string
    dom: Document
    headers: Record<string, string | string[]>
  }): Promise<boolean> {
    const ghostImages = input.dom.querySelectorAll('img[src*="ghost.org"]')
    return ghostImages.length > 0
  }

  async parseNewsletterUrl(
    headers: Record<string, string | string[]>,
    html: string
  ): Promise<string | undefined> {
    return this.findNewsletterUrl(html)
  }
}

View File

@@ -0,0 +1,27 @@
import { ContentHandler } from '../content-handler'
/** Handler for the Golang Weekly newsletter: recognizes the sender and strips
 * site chrome from the email body during preParse. */
export class GolangHandler extends ContentHandler {
  constructor() {
    super()
    // Escaped the '.' so the pattern cannot match e.g. "@golangweeklyXcom".
    this.senderRegex = /<.+@golangweekly\.com>/
    this.urlRegex = /<a href=["']([^"']*)["'].*>Read on the Web<\/a>/
    this.name = 'golangweekly'
  }

  shouldPreParse(url: string, dom: Document): boolean {
    const host = this.name + '.com'
    // check if url ends with golangweekly.com
    return new URL(url).hostname.endsWith(host)
  }

  async preParse(url: string, dom: Document): Promise<Document> {
    const body = dom.querySelector('body')
    // this removes the "Subscribe" button
    body?.querySelector('.el-splitbar')?.remove()
    // this removes the title
    body?.querySelector('.el-masthead')?.remove()
    return Promise.resolve(dom)
  }
}

View File

@@ -0,0 +1,27 @@
import { ContentHandler } from '../content-handler'
/** Handler for HEY World newsletters, recognized by their sender domain. */
export class HeyWorldHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'hey-world'
    // Escaped the dots so the pattern cannot match look-alike domains
    // (previously /<.+@world.hey.com>/ matched any character at the dots).
    this.senderRegex = /<.+@world\.hey\.com>/
  }

  // HEY World emails contain a "View this post online" anchor; the last
  // matching anchor wins.
  findNewsletterHeaderHref(dom: Document): string | undefined {
    let res: string | undefined = undefined
    dom.querySelectorAll('a').forEach((e) => {
      if (e.textContent === 'View this post online') {
        res = e.getAttribute('href') || undefined
      }
    })
    return res
  }

  async parseNewsletterUrl(
    headers: Record<string, string | string[]>,
    html: string
  ): Promise<string | undefined> {
    return this.findNewsletterUrl(html)
  }
}

View File

@@ -0,0 +1,33 @@
import { ContentHandler } from '../content-handler'
import addressparser from 'addressparser'
/** Handler for Times of India newsletters, matched by the sender address. */
export class IndiaTimesHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'India Times'
  }

  // Matched by the exact address of the Times of India newsletter sender.
  async isNewsletter(input: {
    from: string
    html: string
    headers: Record<string, string | string[]>
    dom: Document
  }): Promise<boolean> {
    const senders = addressparser(input.from)
    return senders.some((s) => s.address === 'newsletters@timesofindia.com')
  }

  // The "view in browser" anchor carries the canonical post URL; the last
  // matching anchor wins.
  findNewsletterHeaderHref(dom: Document): string | undefined {
    let href: string | undefined = undefined
    dom.querySelectorAll('a').forEach((anchor) => {
      if (anchor.textContent === 'view in browser') {
        href = anchor.getAttribute('href') || undefined
      }
    })
    return href
  }
}

View File

@@ -0,0 +1,35 @@
import { ContentHandler } from '../content-handler'
/** Handler for the Morning Brew newsletter: recognizes the sender and
 * simplifies the markets-table markup during preParse. */
export class MorningBrewHandler extends ContentHandler {
  constructor() {
    super()
    // Escaped the '.' so the pattern cannot match e.g. "@morningbrewXcom".
    this.senderRegex = /Morning Brew <crew@morningbrew\.com>/
    this.urlRegex = /<a.* href=["']([^"']*)["'].*>View Online<\/a>/
    this.name = 'morningbrew'
  }

  shouldPreParse(url: string, dom: Document): boolean {
    const host = this.name + '.com'
    // check if url ends with morningbrew.com
    return new URL(url).hostname.endsWith(host)
  }

  async preParse(url: string, dom: Document): Promise<Document> {
    // retain the width of the cells in the table of market info
    dom.querySelectorAll('.markets-arrow-cell').forEach((td) => {
      const table = td.closest('table')
      if (!table) {
        return
      }
      const bubbleTable = table.querySelector('.markets-bubble')
      if (bubbleTable) {
        // replace the nested table with just its text node
        const text = bubbleTable.querySelector('.markets-table-text')
        if (text) {
          bubbleTable.parentNode?.replaceChild(text, bubbleTable)
        }
      }
      // set custom class for the table
      table.className = 'morning-brew-markets'
    })
    return Promise.resolve(dom)
  }
}

View File

@@ -0,0 +1,44 @@
import { ContentHandler } from '../content-handler'
/** Handler for (the now-defunct) Revue newsletters, detected via Revue-hosted
 * images plus a resolvable "View online" link. */
export class RevueHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'revue'
  }

  // Revue emails expose a "View online" anchor inside the layout table; the
  // last matching anchor wins.
  findNewsletterHeaderHref(dom: Document): string | undefined {
    let href: string | undefined = undefined
    dom
      .querySelectorAll('table tr td a[target="_blank"]')
      .forEach((anchor) => {
        if (anchor.textContent === 'View online') {
          href = anchor.getAttribute('href') || undefined
        }
      })
    return href
  }

  // Detected by getrevue.co / revue.email images together with a usable
  // "View online" link.
  async isNewsletter(input: {
    from: string
    dom: Document
    headers: Record<string, string | string[]>
  }): Promise<boolean> {
    const { dom } = input
    const icons = dom.querySelectorAll(
      'img[src*="getrevue.co"], img[src*="revue.email"]'
    )
    if (icons.length === 0) {
      return false
    }
    return this.findNewsletterHeaderHref(dom) !== undefined
  }

  async parseNewsletterUrl(
    headers: Record<string, string | string[]>,
    html: string
  ): Promise<string | undefined> {
    return this.findNewsletterUrl(html)
  }
}

View File

@@ -0,0 +1,139 @@
import addressparser from 'addressparser'
import { ContentHandler } from '../content-handler'
/** Handler for Substack newsletters: detection via headers/DOM markers,
 * chrome stripping, and static-tweet normalization. */
export class SubstackHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'substack'
  }

  shouldPreParse(url: string, dom: Document): boolean {
    const host = this.name + '.com'
    const cdnHost = 'substackcdn.com'
    // check if url ends with substack.com
    // or has a profile image hosted at substack.com or substackcdn.com.
    // BUG FIX: the original used includes(host || cdnHost); since `host` is
    // always truthy that expression only ever checked `host`, so images on
    // substackcdn.com were never matched.
    const imgSrc = dom.querySelector('.email-body img')?.getAttribute('src')
    return (
      new URL(url).hostname.endsWith(host) ||
      !!(imgSrc && (imgSrc.includes(host) || imgSrc.includes(cdnHost)))
    )
  }

  // Strips Substack chrome (header, footer, share/restack buttons) and
  // normalizes embedded static tweets.
  async preParse(url: string, dom: Document): Promise<Document> {
    const body = dom.querySelector('.email-body-container')
    // this removes header and profile avatar
    body?.querySelector('.header')?.remove()
    body?.querySelector('.preamble')?.remove()
    body?.querySelector('.meta-author-wrap')?.remove()
    // this removes meta button
    body?.querySelector('.post-meta')?.remove()
    // this removes footer
    body?.querySelector('.post-cta')?.remove()
    body?.querySelector('.container-border')?.remove()
    body?.querySelector('.footer')?.remove()
    // this removes the "restack" button
    body?.querySelector('.email-ufi-2-bottom')?.remove()
    // this removes the "share" button
    body?.querySelector('.email-ufi-2-top')?.remove()
    dom = this.fixupStaticTweets(dom)
    return Promise.resolve(dom)
  }

  findNewsletterHeaderHref(dom: Document): string | undefined {
    // Substack header links
    const postLink = dom.querySelector('h1 a')
    if (postLink) {
      return postLink.getAttribute('href') || undefined
    }
    return undefined
  }

  // Detects a Substack newsletter via the List-Post header, the *post-meta
  // table class, or a header link combined with Substack UI icons.
  async isNewsletter({
    headers,
    dom,
  }: {
    from: string
    headers: Record<string, string | string[]>
    dom: Document
  }): Promise<boolean> {
    if (headers['list-post']) {
      return Promise.resolve(true)
    }
    // substack newsletter emails have tables with a *post-meta class
    if (dom.querySelector('table[class$="post-meta"]')) {
      return true
    }
    // If the article has a header link, and substack icons its probably a newsletter
    const href = this.findNewsletterHeaderHref(dom)
    const oldHeartIcon = dom.querySelector(
      'table tbody td span a img[src*="HeartIcon"]'
    )
    const oldRecommendIcon = dom.querySelector(
      'table tbody td span a img[src*="RecommendIconRounded"]'
    )
    const heartIcon = dom.querySelector('a img[src*="LucideHeart"]')
    const commentsIcon = dom.querySelector('a img[src*="LucideComments"]')
    return Promise.resolve(
      !!(
        href &&
        (oldHeartIcon || oldRecommendIcon || heartIcon || commentsIcon)
      )
    )
  }

  async parseNewsletterUrl(
    headers: Record<string, string | string[]>,
    html: string
  ): Promise<string | undefined> {
    // raw SubStack newsletter url is like <https://hongbo130.substack.com/p/tldr>
    // we need to get the real url from the raw url
    const postHeader = headers['list-post']?.toString()
    if (postHeader && addressparser(postHeader).length > 0) {
      return Promise.resolve(addressparser(postHeader)[0].name)
    }
    return this.findNewsletterUrl(html)
  }

  // Renames static-tweet DOM classes with a private prefix and drops inline
  // styles so the reader can restyle them.
  fixupStaticTweets(dom: Document): Document {
    const preClassName = '_omnivore-static-'
    const staticTweets = dom.querySelectorAll('div[class="tweet static"]')
    if (staticTweets.length < 1) {
      return dom
    }
    // depth-first walk applying f to every descendant element
    const recurse = (node: Element, f: (node: Element) => void) => {
      for (let i = 0; i < node.children.length; i++) {
        const child = node.children[i]
        recurse(child, f)
        f(child)
      }
    }
    for (const tweet of Array.from(staticTweets)) {
      tweet.className = preClassName + 'tweet'
      tweet.removeAttribute('style')
      // get all children, rename their class, remove style
      // elements (style will be handled in the reader)
      recurse(tweet, (n: Element) => {
        const className = n.className
        if (
          className.startsWith('tweet-') ||
          className.startsWith('quote-tweet')
        ) {
          n.className = preClassName + className
        }
        n.removeAttribute('style')
      })
    }
    return dom
  }
}

View File

@@ -0,0 +1,32 @@
import { createClient } from 'redis'
// explicitly create the return type of RedisClient
export type RedisClient = ReturnType<typeof createClient>

/**
 * Creates a Redis client and connects it before returning.
 *
 * @param url  redis:// or rediss:// connection URL; rediss:// enables TLS
 * @param cert optional certificate PEM; escaped "\n" sequences are unescaped
 * @returns a connected RedisClient
 * @throws whatever redisClient.connect() rejects with
 */
export const createRedisClient = async (
  url?: string,
  cert?: string
): Promise<RedisClient> => {
  const redisClient = createClient({
    url,
    socket: {
      tls: url?.startsWith('rediss://'), // rediss:// is the protocol for TLS
      cert: cert?.replace(/\\n/g, '\n'), // replace \n with new line
      // NOTE(review): this disables TLS certificate verification for ALL
      // connections, not only self-signed ones — confirm that is acceptable.
      rejectUnauthorized: false, // for self-signed certs
      connectTimeout: 10000, // 10 seconds
      // give up after 10 attempts; otherwise retry once per second
      reconnectStrategy(retries: number): number | Error {
        if (retries > 10) {
          return new Error('Retries exhausted')
        }
        return 1000
      },
    },
  })
  redisClient.on('error', (err) => console.error('Redis Client Error', err))
  await redisClient.connect()
  console.log('Redis Client Connected:', url)
  return redisClient
}

View File

@@ -0,0 +1,31 @@
import axios from 'axios'
import { parseHTML } from 'linkedom'
import { ContentHandler, PreHandleResult } from '../content-handler'
/** Handler for apple.news short links: fetches the redirect shell page and
 * extracts the real article URL from it. */
export class AppleNewsHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Apple News'
  }

  shouldPreHandle(url: string): boolean {
    return new URL(url).hostname === 'apple.news'
  }

  async preHandle(url: string): Promise<PreHandleResult> {
    const MOBILE_USER_AGENT =
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
    const response = await axios.get(url, {
      headers: { 'User-Agent': MOBILE_USER_AGENT },
    })
    const dom = parseHTML(response.data as string).document
    // the real article link is the parent anchor of the "click here" span;
    // wrapping it in new URL validates it before returning
    const href = dom
      .querySelector('span.click-here')
      ?.parentElement?.getAttribute('href')
    if (!href) {
      return { url: undefined }
    }
    return { url: new URL(href).href }
  }
}

View File

@@ -0,0 +1,86 @@
import axios from 'axios'
import { parseHTML } from 'linkedom'
import { ContentHandler, PreHandleResult } from '../content-handler'
/**
 * Some of the content on Ars Technica is split over several pages.
 * If this is the case we unfurl the entire article into one document.
 */
export class ArsTechnicaHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'ArsTechnica'
  }

  shouldPreHandle(url: string): boolean {
    const u = new URL(url)
    return u.hostname.endsWith('arstechnica.com')
  }

  // Multi-page articles render a page-number navigation element.
  hasMultiplePages(document: Document): boolean {
    return document.querySelectorAll('nav.page-numbers').length > 0
  }

  // Fetches a URL (without JavaScript) and parses it into a Document.
  async grabContentFromUrl(url: string): Promise<Document> {
    const response = await axios.get(url)
    const data = response.data as string
    return parseHTML(data).document
  }

  // Returns the child nodes of a page's article body.
  // (Previously declared Promise<Document[]>, but the values are child nodes,
  // not documents — the return type now matches what is actually returned.)
  async extractArticleContentsFromLink(url: string): Promise<ChildNode[]> {
    const dom = await this.grabContentFromUrl(url)
    const articleContent = dom.querySelector('[itemprop="articleBody"]')
    return Array.from(articleContent?.childNodes ?? [])
  }

  // Fetches every follow-on page and appends its article body to the first page.
  async expandLinksAndCombine(document: Document): Promise<Document> {
    const pageNumbers = document.querySelector('nav.page-numbers')
    const articleBody = document.querySelector('[itemprop="articleBody"]')
    if (!pageNumbers || !articleBody) {
      // We shouldn't ever really get here, but sometimes weird things happen.
      return document
    }
    const pageLinkNodes = pageNumbers.querySelectorAll('a')
    // Remove the "Next" Link, as it will duplicate some content.
    const pageLinks = Array.from(pageLinkNodes)
      .slice(0, pageLinkNodes.length - 1)
      .map(({ href }) => href)
    const pageContents = await Promise.all(
      pageLinks.map(this.extractArticleContentsFromLink.bind(this))
    )
    for (const articleContents of pageContents) {
      // We place all the content in a span to indicate that a page has been parsed.
      const span = document.createElement('SPAN')
      span.className = 'nextPageContents'
      span.append(...articleContents)
      articleBody.append(span)
    }
    pageNumbers.remove()
    return document
  }

  async preHandle(url: string): Promise<PreHandleResult> {
    // We simply retrieve the article without Javascript enabled using a GET command.
    const dom = await this.grabContentFromUrl(url)
    if (!this.hasMultiplePages(dom)) {
      return {
        content: dom.body.outerHTML,
        title: dom.title,
        dom,
      }
    }
    const expandedDom = await this.expandLinksAndCombine(dom)
    return {
      content: expandedDom.body.outerHTML,
      title: dom.title,
      dom: expandedDom,
    }
  }
}

View File

@@ -0,0 +1,41 @@
import axios from 'axios'
import { parseHTML } from 'linkedom'
import { ContentHandler, PreHandleResult } from '../content-handler'
export class BloombergHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Bloomberg'
  }

  shouldPreHandle(url: string): boolean {
    // Note the escaped dot after "bloomberg": the original unescaped `.`
    // also matched hosts such as `bloombergXcom`.
    const BLOOMBERG_URL_MATCH =
      /https?:\/\/(www\.)?bloomberg\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_+.~#?&/=]*)/
    return BLOOMBERG_URL_MATCH.test(url.toString())
  }

  /**
   * Fetch the fully rendered page through the ScrapingBee proxy (Bloomberg
   * blocks plain HTTP clients) and return its title and <body> markup.
   * Requires SCRAPINGBEE_API_KEY in the environment.
   */
  async preHandle(url: string): Promise<PreHandleResult> {
    console.log('prehandling bloomberg url', url)
    try {
      const response = await axios.get('https://app.scrapingbee.com/api/v1', {
        params: {
          api_key: process.env.SCRAPINGBEE_API_KEY,
          url: url,
          return_page_source: true,
          block_ads: true,
          block_resources: false,
        },
      })
      const dom = parseHTML(response.data).document
      return {
        title: dom.title,
        content: dom.querySelector('body')?.innerHTML,
        url: url,
      }
    } catch (error) {
      console.error('error prehandling bloomberg url', error)
      throw error
    }
  }
}

View File

@@ -0,0 +1,34 @@
import { ContentHandler, PreHandleResult } from '../content-handler'
import axios from 'axios'
import { parseHTML } from 'linkedom'
export class DerstandardHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Derstandard'
  }

  shouldPreHandle(url: string): boolean {
    return new URL(url).hostname === 'www.derstandard.at'
  }

  /**
   * Fetch the article with the GDPR consent cookie preset so the server
   * returns the article body instead of the consent wall.
   */
  async preHandle(url: string): Promise<PreHandleResult> {
    const response = await axios.get(url, {
      // set cookie to give consent to get the article
      headers: {
        cookie: `DSGVO_ZUSAGE_V1=true; consentUUID=2bacb9c1-1e80-4be0-9f7b-ee987cf4e7b0_6`,
      },
    })
    const html = response.data as string
    const document = parseHTML(html).document
    // Detach the heading so it is not duplicated inside the content; a
    // detached node still exposes its textContent for the title below.
    const heading = document.querySelector('.article-title')
    heading?.remove()
    return {
      content: document.body.outerHTML,
      title: heading?.textContent || undefined,
    }
  }
}

View File

@@ -0,0 +1,44 @@
import { ContentHandler } from '../content-handler'
export class GitHubHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'github'
  }

  shouldPreParse(url: string, dom: Document): boolean {
    // Match github.com and its subdomains only; a bare `endsWith` would
    // also accept unrelated hosts such as `evil-github.com`.
    const { hostname } = new URL(url)
    return hostname === 'github.com' || hostname.endsWith('.github.com')
  }

  /**
   * Reduce a GitHub page to its <article> element (the README on repo
   * homepages) and tidy the title/author metadata before parsing.
   */
  async preParse(url: string, dom: Document): Promise<Document> {
    const body = dom.querySelector('body')
    const article = dom.querySelector('article')
    const twitterTitle = dom.querySelector(`meta[name='twitter:title']`)
    const linkAuthor = dom.querySelector(`span[itemprop='author']`)
    if (body && article) {
      body.replaceChildren(article)
      // Attempt to set the author also. This is available on repo homepages
      // but not on things like PRs. Ideally we want PRs and issues to have
      // author set to the author of the PR/issue.
      if (linkAuthor && linkAuthor.textContent) {
        const author = dom.createElement('span')
        author.setAttribute('rel', 'author')
        // Assign via textContent (the original used innerHTML), so author
        // names containing markup-significant characters are not parsed
        // as HTML.
        author.textContent = linkAuthor.textContent
        article.appendChild(author)
      }
    }
    // Remove the GitHub - and repo org from the title
    const twitterTitleContent = twitterTitle?.getAttribute('content')
    if (twitterTitle && twitterTitleContent) {
      twitterTitle.setAttribute(
        'content',
        twitterTitleContent.replace(/GitHub - (.*?)\//, '')
      )
    }
    return Promise.resolve(dom)
  }
}

View File

@@ -0,0 +1,33 @@
import { ContentHandler, PreHandleResult } from '../content-handler'
export class ImageHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Image'
  }

  shouldPreHandle(url: string): boolean {
    // Direct links to common raster image formats.
    const IMAGE_URL_PATTERN = /(https?:\/\/.*\.(?:jpg|jpeg|png|webp))/i
    return IMAGE_URL_PATTERN.test(url.toString())
  }

  /**
   * Wrap a bare image URL in a minimal HTML page so the downstream parser
   * can extract a title and a preview image.
   */
  async preHandle(url: string): Promise<PreHandleResult> {
    // Use the file name as the title, ignoring any query string or
    // fragment (the original kept e.g. `?w=100` in the title).
    const path = url.toString().split(/[?#]/)[0]
    const title = path.split('/').pop() || 'Image'
    const content = `
    <html>
      <head>
        <title>${title}</title>
        <meta property="og:image" content="${url}" />
        <meta property="og:title" content="${title}" />
        <meta property="og:type" content="image" />
      </head>
      <body>
        <div>
          <img src="${url}" alt="${title}">
        </div>
      </body>
    </html>`
    return Promise.resolve({ title, content })
  }
}

View File

@@ -0,0 +1,26 @@
import { ContentHandler, PreHandleResult } from '../content-handler'
export class MediumHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Medium'
  }

  shouldPreHandle(url: string): boolean {
    // Match medium.com and its subdomains only; a bare `endsWith` would
    // also accept unrelated hosts such as `notmedium.com`.
    const { hostname } = new URL(url)
    return hostname === 'medium.com' || hostname.endsWith('.medium.com')
  }

  /**
   * Strip Medium's `source` tracking parameter and hand the cleaned URL
   * back to the normal fetch pipeline.
   */
  async preHandle(url: string): Promise<PreHandleResult> {
    console.log('prehandling medium url', url)
    try {
      const cleaned = new URL(url)
      cleaned.searchParams.delete('source')
      return Promise.resolve({ url: cleaned.toString() })
    } catch (error) {
      console.error('error prehandling medium url', error)
      throw error
    }
  }
}

View File

@@ -0,0 +1,417 @@
import axios from 'axios'
import { parseHTML } from 'linkedom'
import _, { truncate } from 'lodash'
import { DateTime } from 'luxon'
import { ContentHandler, PreHandleResult } from '../content-handler'
import { createRedisClient, RedisClient } from '../redis'
// Shape of a tweet scraped from a Nitter HTML page (see NitterHandler).
interface Tweet {
  // Status path as found in the tweet-date link (relative, e.g. "/user/status/123").
  url: string
  author: {
    username: string // handle without the leading '@'
    name: string // display name
    profileImageUrl: string // avatar path relative to the Nitter instance
  }
  // Plain-text tweet body.
  text: string
  entities: {
    // Links found in the body; `displayUrl` is the visible anchor text.
    urls: {
      url: string
      displayUrl: string
    }[]
  }
  // Photo/video attachments; for videos `previewUrl` is the poster image.
  attachments: {
    type: string
    url: string
    previewUrl: string
  }[]
  // ISO-8601 creation timestamp.
  createdAt: string
}
/**
 * Renders tweets (including whole threads) by scraping a Nitter mirror,
 * avoiding the Twitter API entirely. Mirror instances are kept in Redis
 * as a sorted set scored by observed request latency, so the fastest
 * mirror is tried first on subsequent requests.
 */
export class NitterHandler extends ContentHandler {
  // matches twitter.com and nitter.net urls
  URL_MATCH =
    /((twitter\.com)|(nitter\.net))\/(?:#!\/)?(\w+)\/status(?:es)?\/(\d+)(?:\/.*)?/
  // Default mirror list; `score` seeds the initial preference order
  // (lower = tried earlier).
  INSTANCES = [
    { value: 'https://nitter.moomoo.me', score: 0 },
    { value: 'https://nitter.net', score: 1 }, // the official instance
    { value: 'https://nitter.lacontrevoie.fr', score: 2 },
    { value: 'https://nitter.kavin.rocks', score: 3 },
    { value: 'https://notabird.site', score: 4 },
    { value: 'https://singapore.unofficialbird.com', score: 5 },
    { value: 'https://nitter.fly.dev', score: 6 },
  ]
  // Redis key of the sorted set mapping instance URL -> latency score.
  REDIS_KEY = 'nitter-instances'
  // The instance that successfully served the current request; used to
  // resolve relative links when following thread pagination.
  private instance: string
  constructor() {
    super()
    this.name = 'Nitter'
    this.instance = ''
  }
  /**
   * Return mirror URLs ordered by score (fastest first). When the set is
   * empty or expired, seed it with the defaults and give it a 1-day TTL.
   */
  async getInstances(redisClient: RedisClient) {
    // get instances by score in ascending order
    const instances = await redisClient.zRange(this.REDIS_KEY, '-inf', '+inf', {
      BY: 'SCORE',
    })
    console.debug('instances', instances)
    // if no instance is found, save the default instances
    if (instances.length === 0) {
      const result = await redisClient.zAdd(this.REDIS_KEY, this.INSTANCES, {
        NX: true, // only add if the key does not exist
      })
      console.debug('add instances', result)
      // expire the key after 1 day
      const exp = await redisClient.expire(this.REDIS_KEY, 60 * 60 * 24)
      console.debug('instances expire in 1 day', exp)
      return this.INSTANCES.map((i) => i.value)
    }
    return instances
  }
  /**
   * Penalize/credit an instance by adding `score` (latency in ms on
   * success, the full timeout on failure) to its sorted-set entry.
   */
  async incrementInstanceScore(
    redisClient: RedisClient,
    instance: string,
    score = 1
  ) {
    await redisClient.zIncrBy(this.REDIS_KEY, score, instance)
  }
  /**
   * Scrape every tweet authored by `username` in the thread containing
   * `tweetId`. Mirrors are tried in score order; "show more" pagination
   * is followed; unavailable tweets and other users' replies are skipped.
   * Returns [] when no mirror responds or parsing fails.
   */
  async getTweets(username: string, tweetId: string) {
    // Extract author info from a `.tweet-header` element.
    function authorParser(header: Element) {
      const profileImageUrl =
        header.querySelector('.tweet-avatar img')?.getAttribute('src') ?? ''
      const name =
        header.querySelector('.fullname')?.getAttribute('title') ?? ''
      const username =
        header.querySelector('.username')?.getAttribute('title') ?? ''
      return {
        profileImageUrl,
        name,
        username: username.replace('@', ''), // remove @ from username
      }
    }
    // Nitter renders the timestamp in the `title` attribute with a
    // " · " separator that Date cannot parse; normalize then ISO-format.
    function dateParser(date: Element) {
      const validDateTime =
        date.getAttribute('title')?.replace(' · ', ' ') ?? Date.now()
      return new Date(validDateTime).toISOString()
    }
    function urlParser(date: Element) {
      return date.getAttribute('href') ?? ''
    }
    // Collect photo and video attachments from an `.attachments` element.
    function attachmentParser(attachments: Element | null) {
      if (!attachments) return []
      const photos = Array.from(attachments.querySelectorAll('img')).map(
        (i) => ({
          url: i.getAttribute('src') ?? '',
          type: 'photo',
          previewUrl: i.getAttribute('src') ?? '',
        })
      )
      const videos = Array.from(attachments.querySelectorAll('video')).map(
        (i) => ({
          url: i.getAttribute('data-url') ?? '',
          type: 'video',
          previewUrl: i.getAttribute('poster') ?? '',
        })
      )
      return [...photos, ...videos]
    }
    // Parse one `.timeline-item` into a Tweet; null when required parts
    // (header, body, date, content) are missing.
    function parseTweet(tweet: Element): Tweet | null {
      const header = tweet.querySelector('.tweet-header')
      if (!header) {
        console.error('no header found', tweet)
        return null
      }
      const author = authorParser(header)
      const body = tweet.querySelector('.tweet-body')
      if (!body) {
        console.error('no body found', tweet)
        return null
      }
      const tweetDateElement = body.querySelector('.tweet-date a')
      if (!tweetDateElement) {
        console.error('no tweet date found', tweet)
        return null
      }
      const createdAt = dateParser(tweetDateElement)
      const url = urlParser(tweetDateElement)
      const content = body.querySelector('.tweet-content')
      if (!content) {
        console.error('no content found', tweet)
        return null
      }
      const text = content.textContent ?? ''
      const urls = Array.from(content.querySelectorAll('a')).map((a) => ({
        url: a.getAttribute('href') ?? '',
        displayUrl: a.textContent ?? '',
      }))
      const attachments = attachmentParser(body.querySelector('.attachments'))
      return {
        author,
        createdAt,
        text,
        url,
        entities: {
          urls,
        },
        attachments,
      }
    }
    const redisClient = await createRedisClient(
      process.env.REDIS_URL,
      process.env.REDIS_CERT
    )
    try {
      const tweets: Tweet[] = []
      const option = {
        timeout: 20000, // 20 seconds
      }
      let html = ''
      // get instances from redis
      const instances = await this.getInstances(redisClient)
      // Try each mirror until one returns the status page; the measured
      // latency (or the timeout on failure) becomes its new score delta.
      for (const instance of instances) {
        try {
          const url = `${instance}/${username}/status/${tweetId}`
          const startTime = Date.now()
          const response = await axios.get(url, option)
          const latency = Math.floor(Date.now() - startTime)
          console.debug('latency', latency)
          html = response.data as string
          this.instance = instance
          await this.incrementInstanceScore(redisClient, instance, latency)
          break
        } catch (error) {
          await this.incrementInstanceScore(
            redisClient,
            instance,
            option.timeout
          )
          if (axios.isAxiosError(error)) {
            console.info(`Error getting tweets from ${instance}`, error.message)
          } else {
            console.info(`Error getting tweets from ${instance}`, error)
          }
        }
      }
      if (!this.instance || !html) {
        console.error('no instance or html found')
        return []
      }
      const document = parseHTML(html).document
      // get the main thread including tweets and threads
      const mainThread = document.querySelector('.main-thread')
      if (!mainThread) {
        console.error('no main thread found')
        return []
      }
      const timelineItems = Array.from(
        mainThread.querySelectorAll('.timeline-item')
      )
      if (timelineItems.length === 0) {
        console.error('no timeline items found')
        return []
      }
      // Indexed loop (not for..of): pagination below appends newly loaded
      // items to `timelineItems` while we are iterating.
      for (let i = 0; i < timelineItems.length; i++) {
        const item = timelineItems[i]
        const classList = item.classList
        // skip unavailable tweets and earlier replies
        if (
          classList.contains('unavailable') ||
          classList.contains('earlier-replies')
        ) {
          console.info('skip unavailable tweets and earlier replies')
          continue
        }
        // if there are more replies, get them
        if (classList.contains('more-replies')) {
          const newUrl = item.querySelector('a')?.getAttribute('href')
          if (!newUrl) {
            console.error('no new url', newUrl)
            break
          }
          let html = ''
          try {
            // go to new url and wait for it to load
            const response = await axios.get(
              `${this.instance}${newUrl}`,
              option
            )
            html = response.data as string
          } catch (error) {
            console.error('Error getting tweets', error)
            break
          }
          const document = parseHTML(html).document
          const nextThread = document.querySelector('.main-thread .after-tweet')
          if (!nextThread) {
            console.error('no next thread found')
            break
          }
          // get the new timeline items and add them to the list
          const newTimelineItems = Array.from(
            nextThread.querySelectorAll('.timeline-item')
          )
          timelineItems.push(...newTimelineItems)
          continue
        }
        const tweet = parseTweet(item)
        // filter out replies
        if (
          tweet &&
          tweet.author.username.toLowerCase() === username.toLowerCase()
        ) {
          tweets.push(tweet)
        }
      }
      return tweets
    } catch (error) {
      console.error('Error getting tweets', error)
      return []
    } finally {
      await redisClient?.quit()
    }
  }
  // Extract domain, username and status id from a twitter/nitter URL.
  parseTweetUrl = (url: string) => {
    const match = url.match(this.URL_MATCH)
    return {
      domain: match?.[1],
      username: match?.[4],
      tweetId: match?.[5],
    }
  }
  // Page title: "<name> on Twitter: <text>" with the first link stripped
  // and the text truncated to 100 characters.
  titleForTweet = (author: { name: string }, text: string) => {
    return `${author.name} on Twitter: ${truncate(text.replace(/http\S+/, ''), {
      length: 100,
    })}`
  }
  // Locale-formatted full date-time string for display in the footer.
  formatTimestamp = (timestamp: string) => {
    return DateTime.fromJSDate(new Date(timestamp)).toLocaleString(
      DateTime.DATETIME_FULL
    )
  }
  shouldPreHandle(url: string): boolean {
    return this.URL_MATCH.test(url.toString())
  }
  /**
   * Scrape the tweet's thread from a Nitter mirror and render it as a
   * standalone HTML page with OpenGraph metadata.
   * @throws when the URL cannot be parsed or no tweets could be scraped
   */
  async preHandle(url: string): Promise<PreHandleResult> {
    const { tweetId, username, domain } = this.parseTweetUrl(url)
    if (!tweetId || !username || !domain) {
      throw new Error('could not parse tweet url')
    }
    const tweets = await this.getTweets(username, tweetId)
    if (tweets.length === 0) {
      throw new Error('could not get tweets')
    }
    const tweet = tweets[0]
    const author = tweet.author
    // escape html entities in title
    const title = this.titleForTweet(author, tweet.text)
    const escapedTitle = _.escape(title)
    const authorImage = `${this.instance}${author.profileImageUrl.replace(
      '_normal',
      '_400x400'
    )}`
    const description = _.escape(tweet.text) || escapedTitle
    // Media URLs scraped from Nitter are proxied; rewrite them to the
    // origin image host for twitter.com links.
    const imageDomain =
      domain.toLowerCase() === 'twitter.com'
        ? 'https://pbs.twimg.com'
        : 'https://nitter.net/pic'
    let tweetsContent = ''
    for (const tweet of tweets) {
      let text = tweet.text
      // Turn plain-text link spans back into anchors.
      for (const urlObj of tweet.entities.urls) {
        text = text.replace(
          urlObj.displayUrl,
          `<a href="${urlObj.url}">${urlObj.displayUrl}</a>`
        )
      }
      const includesHtml = tweet.attachments
        .map(
          (attachment) =>
            `<a class="media-link" href=${imageDomain}${decodeURIComponent(
              attachment.url
            ).replace('/pic', '')}>
          <picture>
            <img class="tweet-img" src=${imageDomain}${decodeURIComponent(
              attachment.previewUrl
            ).replace('/pic', '')} />
          </picture>
          </a>`
        )
        .join('\n')
      tweetsContent += `<p class="_omnivore_tweet_content">${text}</p>${includesHtml}`
    }
    const tweetUrl = `
       — <a href="https://${domain}/${author.username}">${
      author.username
    }</a> <span itemscope itemtype="https://schema.org/Person" itemprop="author">${
      author.name
    }</span> <a href="${url}">${this.formatTimestamp(tweet.createdAt)}</a>`
    const content = `
        <html>
            <head>
              <meta property="og:image" content="${authorImage}" />
              <meta property="og:image:secure_url" content="${authorImage}" />
              <meta property="og:title" content="${escapedTitle}" />
              <meta property="og:description" content="${description}" />
              <meta property="article:published_time" content="${tweet.createdAt}" />
              <meta property="og:site_name" content="Twitter" />
              <meta property="og:type" content="tweet" />
              <meta property="dc:creator" content="${author.name}" />
              <meta property="twitter:description" content="${description}" />
            </head>
            <body>
              <div class="_omnivore_twitter">
                ${tweetsContent}
                ${tweetUrl}
              </div>
            </body>
        </html>`
    return { content, url, title }
  }
}

View File

@@ -0,0 +1,18 @@
import { ContentHandler, PreHandleResult } from '../content-handler'
export class PdfHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'PDF'
  }

  shouldPreHandle(url: string): boolean {
    // `URL.pathname` never contains the query string, so the original
    // `u.pathname.replace(u.search, '')` was dead code; test the
    // pathname directly.
    return new URL(url).pathname.endsWith('.pdf')
  }

  /** Route PDFs to the dedicated PDF pipeline via the contentType field. */
  async preHandle(url: string): Promise<PreHandleResult> {
    return Promise.resolve({ contentType: 'application/pdf' })
  }
}

View File

@@ -0,0 +1,83 @@
import axios from 'axios'
import _ from 'underscore'
import { ContentHandler, PreHandleResult } from '../content-handler'
export class PipedVideoHandler extends ContentHandler {
  // https://piped.video/watch?v={videoId}
  PIPED_URL_MATCH = /^((?:https?:)?\/\/)?piped\.video\/watch\?v=[^&]+/

  constructor() {
    super()
    this.name = 'Piped-video'
  }

  /** Extract the `v` query parameter (the YouTube video id) from the URL. */
  getYoutubeVideoId = (url: string) => {
    const u = new URL(url)
    return u.searchParams.get('v')
  }

  /** Escape HTML entities so text can be embedded safely in markup. */
  escapeTitle = (title: string) => {
    return _.escape(title)
  }

  shouldPreHandle(url: string): boolean {
    return this.PIPED_URL_MATCH.test(url.toString())
  }

  /**
   * Look the video up on a Piped API instance and build an HTML page that
   * embeds the player plus title/author/thumbnail metadata.
   * Returns {} when the video id or stream list is missing.
   */
  async preHandle(url: string): Promise<PreHandleResult> {
    const videoId = this.getYoutubeVideoId(url)
    if (!videoId) {
      return {}
    }
    const baseUrl = 'https://api-piped.mha.fi'
    const apiUrl = `${baseUrl}/streams/${videoId}`
    const metadata = (await axios.get(apiUrl)).data as {
      title: string
      thumbnailUrl: string
      uploader: string
      uploaderUrl: string
      uploadDate: string
      description: string
      videoStreams: {
        width: number
        height: number
        url: string
      }[]
    }
    const videoStreams = metadata.videoStreams
    if (!videoStreams || videoStreams.length == 0) {
      return {}
    }
    const videoStream = videoStreams[0]
    const src = `https://piped.mha.fi/embed/${videoId}`
    // escape html entities in title and description — both are written
    // into attribute values below (the original embedded the raw
    // description, which broke the markup when it contained quotes)
    const title = metadata.title
    const escapedTitle = this.escapeTitle(title)
    const escapedDescription = _.escape(metadata.description)
    // Scale the embed to a fixed height, preserving the stream's aspect ratio.
    const ratio = videoStream.width / videoStream.height
    const thumbnail = metadata.thumbnailUrl
    const height = 350
    const width = height * ratio
    const authorName = _.escape(metadata.uploader)
    const content = `
    <html>
      <head>
        <title>${escapedTitle}</title>
        <meta property="og:image" content="${thumbnail}" />
        <meta property="og:image:secure_url" content="${thumbnail}" />
        <meta property="og:title" content="${escapedTitle}" />
        <meta property="og:description" content="${escapedDescription}" />
        <meta property="og:article:author" content="${authorName}" />
        <meta property="og:site_name" content="Piped Video" />
        <meta property="article:published_time" content="${metadata.uploadDate}" />
        <meta property="og:type" content="video" />
      </head>
      <body>
        <iframe width="${width}" height="${height}" src="${src}" title="${escapedTitle}" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
        <p><a href="${url}" target="_blank">${escapedTitle}</a></p>
        <p itemscope="" itemprop="author" itemtype="http://schema.org/Person">By <a href="https://piped.video${metadata.uploaderUrl}" target="_blank">${authorName}</a></p>
      </body>
    </html>`
    return { content, title }
  }
}

View File

@@ -0,0 +1,38 @@
import { ContentHandler, PreHandleResult } from '../content-handler'
import axios from 'axios'
import { parseHTML } from 'linkedom'
export class ScrapingBeeHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'ScrapingBee'
  }

  shouldPreHandle(url: string): boolean {
    const { hostname } = new URL(url)
    const domains = ['nytimes.com', 'news.google.com', 'fool.ca']
    // Match each domain or one of its subdomains. The original bare
    // `endsWith` also accepted unrelated hosts whose name merely ends
    // with the same characters (e.g. `notnytimes.com`).
    return domains.some((d) => hostname === d || hostname.endsWith(`.${d}`))
  }

  /**
   * Fetch the fully rendered page through the ScrapingBee proxy (these
   * sites block or paywall plain HTTP clients).
   * Requires SCRAPINGBEE_API_KEY in the environment.
   */
  async preHandle(url: string): Promise<PreHandleResult> {
    console.log('prehandling url with scrapingbee', url)
    try {
      const response = await axios.get('https://app.scrapingbee.com/api/v1', {
        params: {
          api_key: process.env.SCRAPINGBEE_API_KEY,
          url: url,
          return_page_source: true,
          block_ads: true,
          block_resources: false,
        },
      })
      const dom = parseHTML(response.data).document
      return { title: dom.title, content: response.data as string, url: url }
    } catch (error) {
      console.error('error prehandling url w/scrapingbee', error)
      throw error
    }
  }
}

View File

@@ -0,0 +1,121 @@
import { ContentHandler } from '../content-handler'
/**
 * Rewrites a Stack Overflow question page into a clean linear document:
 * the question, then every answer, each followed by its author
 * signatures and comments.
 */
export class StackOverflowHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'stackoverflow'
  }
  /**
   * Render a post body with an "<title>: N votes" heading.
   * NOTE(review): when the upvoteCount element is missing the returned
   * div stays empty, silently dropping the post body — confirm intended.
   */
  parseText(element: Element, title: string) {
    const newText = element.ownerDocument.createElement('div')
    const text = element.querySelector(`div[itemprop='text']`)
    if (text) {
      const votes = element
        .querySelector(`div[itemprop='upvoteCount']`)
        ?.getAttribute('data-value')
      if (votes) {
        newText.innerHTML = `<h2>${title}: ${votes} vote${
          votes === '1' ? '' : 's'
        }</h2>${text.innerHTML}`
      }
    }
    return newText
  }
  /** Collect a post's comments into a div under a "Comments" heading. */
  parseComments(element: Element) {
    const dom = element.ownerDocument
    const newComments = dom.createElement('div')
    // comments
    const commentsDiv = element.querySelector(`.comments`)
    if (commentsDiv) {
      const comments = commentsDiv.querySelectorAll(`.comment`)
      if (comments.length > 0) {
        newComments.innerHTML = `<h3>Comments</h3>`
        comments.forEach((comment) => {
          const author = comment.querySelector(`.comment-user`)
          const text = comment.querySelector(`.comment-copy`)?.textContent
          const authorHref = author?.getAttribute('href')
          const date = comment.querySelector(`.relativetime-clean`)?.textContent
          // comments missing any of these fields are skipped entirely
          if (author && text && authorHref && date) {
            const newComment = dom.createElement('p')
            newComment.innerHTML = `<a href="${authorHref}"><b>${author.innerHTML}</b></a>: ${text} - ${date}`
            newComments.appendChild(newComment)
          }
        })
      }
    }
    return newComments
  }
  /**
   * Collect post signatures (author cards) into a div: name, reputation,
   * badge summary and action date. The post owner additionally gets a
   * <span rel="author"> marker so downstream parsing can pick up the
   * article author.
   */
  parseAuthors(element: Element) {
    const dom = element.ownerDocument
    const newAuthors = dom.createElement('div')
    const authors = element.querySelectorAll(`.post-signature`)
    authors.forEach((author) => {
      const isOwner = author.classList.contains('owner')
      const name = author.querySelector(`.user-details a`)?.textContent
      const link = author.querySelector(`.user-details a`)?.getAttribute('href')
      const reputation = author.querySelector(`.reputation-score`)?.textContent
      const badges = Array.from(
        author.querySelectorAll(`span[title*='badges']`)
      )
        .map((badge) => badge.getAttribute('title'))
        .join(', ')
      const date = author.querySelector(`.user-action-time`)?.textContent
      if (name && link && reputation && date) {
        const newAuthor = dom.createElement('p')
        newAuthor.innerHTML = `<a href="${link}"><b>${name}</b></a> - ${reputation} reputation - ${
          badges || 'no badge'
        } - ${date}`
        if (isOwner) {
          const author = dom.createElement('span')
          author.setAttribute('rel', 'author')
          author.innerHTML = name
          newAuthor.appendChild(author)
        }
        newAuthors.appendChild(newAuthor)
      }
    })
    return newAuthors
  }
  shouldPreParse(url: string, dom: Document): boolean {
    // NOTE(review): bare `endsWith` also matches hosts like
    // `xstackoverflow.com`; likely harmless here but worth confirming.
    return new URL(url).hostname.endsWith('stackoverflow.com')
  }
  /** Replace the page body with the linearized question + answers. */
  async preParse(url: string, dom: Document): Promise<Document> {
    const mainEntity = dom.querySelector(`div[itemprop='mainEntity']`)
    if (mainEntity) {
      const newMainEntity = dom.createElement('div')
      const question = mainEntity.querySelector('#question')
      if (question) {
        newMainEntity.appendChild(this.parseText(question, 'Question'))
        newMainEntity.appendChild(this.parseAuthors(question))
        newMainEntity.appendChild(this.parseComments(question))
      }
      const answersDiv = mainEntity.querySelector('#answers')
      if (answersDiv) {
        const answers = answersDiv.querySelectorAll(`.answer`)
        answers.forEach((answer) => {
          const title = answer.classList.contains('accepted-answer')
            ? 'Accepted Answer'
            : 'Answer'
          newMainEntity.appendChild(this.parseText(answer, title))
          newMainEntity.appendChild(this.parseAuthors(answer))
          newMainEntity.appendChild(this.parseComments(answer))
        })
      }
      dom.body.replaceChildren(newMainEntity)
    }
    return Promise.resolve(dom)
  }
}

View File

@@ -0,0 +1,26 @@
import { ContentHandler } from '../content-handler'
import axios from 'axios'
export class TDotCoHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 't.co'
  }

  shouldResolve(url: string): boolean {
    // t.co is Twitter's link shortener.
    return /^https:\/\/(?:www\.)?t\.co\/.*$/.test(url)
  }

  /**
   * Ask the shortener for its redirect target without following it and
   * return the target URL, or undefined when anything goes wrong.
   */
  async resolve(url: string) {
    try {
      const res = await axios.get(url, {
        maxRedirects: 0,
        validateStatus: null,
      })
      return new URL(res.headers.location).href
    } catch (err) {
      console.log('err with t.co url', err)
      return undefined
    }
  }
}

View File

@@ -0,0 +1,59 @@
import axios from 'axios'
import { parseHTML } from 'linkedom'
import { ContentHandler, PreHandleResult } from '../content-handler'
export class TheAtlanticHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'The Atlantic'
  }

  shouldPreHandle(url: string): boolean {
    // Match theatlantic.com and its subdomains only; a bare `endsWith`
    // would also accept unrelated hosts.
    const { hostname } = new URL(url)
    return (
      hostname === 'theatlantic.com' || hostname.endsWith('.theatlantic.com')
    )
  }

  /** Drop "related content" boxes interleaved with the article body. */
  removeRelatedContentLinks(articleContent: Element): Node[] {
    const content = Array.from(articleContent.children)
    return content.filter(
      (paragraph) => !paragraph.className.startsWith('ArticleRelated')
    )
  }

  /**
   * Replace the article-body section with a plain <div id="prehandled">
   * containing only the real article paragraphs; also removes the audio
   * player widget.
   */
  unfurlContent(content: Document): Document {
    const articleContentSection = content.querySelector(
      '[data-event-module="article body"]'
    )
    // Remove the audio player.
    content.querySelector('[data-event-module="audio player"]')?.remove()
    if (!articleContentSection) {
      return content
    }
    const articleContent = this.removeRelatedContentLinks(articleContentSection)
    const divOverArticle = content.createElement('div')
    divOverArticle.setAttribute('id', 'prehandled')
    articleContent.forEach((it) => divOverArticle.appendChild(it))
    // Swap the wrapper in where the section was. The original called
    // `content.insertBefore(divOverArticle, articleContentSection)`, but
    // the section is not a direct child of the Document, which the DOM
    // spec rejects with NotFoundError.
    articleContentSection.replaceWith(divOverArticle)
    return content
  }

  async preHandle(url: string): Promise<PreHandleResult> {
    // We simply retrieve the article without Javascript enabled using a GET command.
    const response = await axios.get(url)
    const data = response.data as string
    const dom = parseHTML(data).document
    const editedDom = this.unfurlContent(dom)
    return {
      content: editedDom.body.outerHTML,
      title: dom.title,
      dom: editedDom,
    }
  }
}

View File

@@ -0,0 +1,388 @@
import axios from 'axios'
import { truncate } from 'lodash'
import { DateTime } from 'luxon'
import { Browser, BrowserContext } from 'puppeteer-core'
import _ from 'underscore'
import { ContentHandler, PreHandleResult } from '../content-handler'
// `includes` expansion payload from the Twitter v2 API.
interface TweetIncludes {
  users: {
    id: string
    name: string
    profile_image_url: string
    username: string
  }[]
  // Present only when the response expanded attachments.media_keys.
  media?: {
    preview_image_url: string
    type: string
    url: string
    media_key: string
  }[]
}
// Search metadata returned by the recent-search endpoint.
interface TweetMeta {
  result_count: number
}
// Core tweet object (the v2 API `data` payload).
interface TweetData {
  author_id: string
  text: string
  entities: {
    urls: {
      url: string // t.co short link exactly as it appears in `text`
      expanded_url: string
      display_url: string
    }[]
  }
  created_at: string
  referenced_tweets: {
    type: string
    id: string
  }[]
  conversation_id: string
  attachments?: {
    media_keys: string[]
  }
}
// A single tweet plus its expanded users/media.
interface Tweet {
  data: TweetData
  includes: TweetIncludes
}
// A page of tweets (search / bulk-lookup response).
interface Tweets {
  data: TweetData[]
  includes: TweetIncludes
  meta: TweetMeta
}
// Bearer token for the Twitter v2 API; prehandling is disabled when unset.
const TWITTER_BEARER_TOKEN = process.env.TWITTER_BEARER_TOKEN
// Captures the username (group 1) and numeric status id (group 2).
const TWITTER_URL_MATCH =
  /twitter\.com\/(?:#!\/)?(\w+)\/status(?:es)?\/(\d+)(?:\/.*)?/
// Maximum number of tweets fetched when unrolling a thread.
const MAX_THREAD_DEPTH = 100
/**
 * Build the query-string fragment listing every tweet, user and media
 * field plus the expansions the handler needs from the Twitter v2 API.
 */
const getTweetFields = () => {
  const tweetFields = [
    'attachments', 'author_id', 'conversation_id', 'created_at', 'entities',
    'geo', 'in_reply_to_user_id', 'lang', 'possibly_sensitive',
    'public_metrics', 'referenced_tweets', 'source', 'withheld',
  ].join(',')
  const userFields = [
    'created_at', 'description', 'entities', 'location', 'pinned_tweet_id',
    'profile_image_url', 'protected', 'public_metrics', 'url', 'verified',
    'withheld',
  ].join(',')
  const mediaFields = [
    'duration_ms', 'height', 'preview_image_url', 'url', 'media_key',
    'public_metrics', 'width',
  ].join(',')
  return (
    `&tweet.fields=${tweetFields}` +
    '&expansions=author_id,attachments.media_keys' +
    `&user.fields=${userFields}` +
    `&media.fields=${mediaFields}`
  )
}
// unroll recent tweet thread
const getTweetThread = async (conversationId: string): Promise<Tweets> => {
  if (!TWITTER_BEARER_TOKEN) {
    throw new Error('No Twitter bearer token found')
  }
  // Search the last 7 days for every tweet in this conversation.
  const endpoint = 'https://api.twitter.com/2/tweets/search/recent'
  const query = encodeURIComponent(`conversation_id:${conversationId}`)
  const apiUrl = new URL(
    `${endpoint}?query=${query}${getTweetFields()}&max_results=${MAX_THREAD_DEPTH}`
  )
  const response = await axios.get<Tweets>(apiUrl.toString(), {
    headers: {
      Authorization: `Bearer ${TWITTER_BEARER_TOKEN}`,
      redirect: 'follow',
    },
  })
  return response.data
}
/** Look up a single tweet by id via the Twitter v2 API. */
const getTweetById = async (id: string): Promise<Tweet> => {
  if (!TWITTER_BEARER_TOKEN) {
    throw new Error('No Twitter bearer token found')
  }
  const apiUrl = new URL(
    `https://api.twitter.com/2/tweets/${id}?${getTweetFields()}`
  )
  const response = await axios.get<Tweet>(apiUrl.toString(), {
    headers: {
      Authorization: `Bearer ${TWITTER_BEARER_TOKEN}`,
      redirect: 'follow',
    },
  })
  return response.data
}
/** Bulk-look up tweets by id via the Twitter v2 API. */
const getTweetsByIds = async (ids: string[]): Promise<Tweets> => {
  if (!TWITTER_BEARER_TOKEN) {
    throw new Error('No Twitter bearer token found')
  }
  const apiUrl = new URL(
    `https://api.twitter.com/2/tweets?ids=${ids.join(',')}${getTweetFields()}`
  )
  const response = await axios.get<Tweets>(apiUrl.toString(), {
    headers: {
      Authorization: `Bearer ${TWITTER_BEARER_TOKEN}`,
      redirect: 'follow',
    },
  })
  return response.data
}
/**
 * Build a page title of the form "<name> on Twitter: <text>", with links
 * stripped and the text truncated to 100 characters.
 */
const titleForTweet = (author: { name: string }, text: string) => {
  // The global flag strips every link; the original non-global regex
  // removed only the first one.
  return `${author.name} on Twitter: ${truncate(text.replace(/http\S+/g, ''), {
    length: 100,
  })}`
}
/** Extract the numeric status id from a twitter.com status URL. */
const tweetIdFromStatusUrl = (url: string): string | undefined => {
  const match = TWITTER_URL_MATCH.exec(url.toString())
  return match ? match[2] : undefined
}
/** Render an ISO timestamp as a locale-formatted full date-time string. */
const formatTimestamp = (timestamp: string) => {
  const parsed = DateTime.fromJSDate(new Date(timestamp))
  return parsed.toLocaleString(DateTime.DATETIME_FULL)
}
/**
 * Pair each tweet in a multi-tweet response with the users list and with
 * only the media items that tweet actually references.
 */
const getTweetsFromResponse = (response: Tweets): Tweet[] => {
  return response.data.map((data) => ({
    data,
    includes: {
      users: response.includes.users,
      media: response.includes.media?.filter((m) =>
        data.attachments?.media_keys?.includes(m.media_key)
      ),
    },
  }))
}
const getOldTweets = async (
browser: Browser,
conversationId: string,
username: string
): Promise<Tweet[]> => {
const tweetIds = await getTweetIds(browser, conversationId, username)
if (tweetIds.length === 0) {
return []
}
const response = await getTweetsByIds(tweetIds)
return getTweetsFromResponse(response)
}
/** Fetch a recent (< 7 days old) thread via the search API, oldest first. */
const getRecentTweets = async (conversationId: string): Promise<Tweet[]> => {
  const thread = await getTweetThread(conversationId)
  if (thread.meta.result_count === 0) {
    return []
  }
  // the search endpoint returns tweets newest-first; flip to reading order
  return getTweetsFromResponse(thread).reverse()
}
/** Resolve after `ms` milliseconds (promisified setTimeout). */
const waitFor = (ms: number) =>
  new Promise((resolve) => {
    setTimeout(resolve, ms)
  })
/**
 * Scrape the ids of every tweet authored by `author` in a thread, using a
 * headless browser — works even for tweets older than the 7-day search
 * window of the API.
 *
 * @param browser shared puppeteer instance (an incognito context is created
 *                and always closed before returning)
 * @param tweetId id of the thread's root tweet
 * @param author  username whose tweets should be collected
 * @returns tweet ids in page order, or [] on any error
 */
const getTweetIds = async (
  browser: Browser,
  tweetId: string,
  author: string
): Promise<string[]> => {
  const pageURL = `https://twitter.com/${author}/status/${tweetId}`
  let context: BrowserContext | undefined
  try {
    context = await browser.createIncognitoBrowserContext()
    const page = await context.newPage()
    // Modify this variable to control the size of viewport
    // (a small scale factor yields a large virtual viewport so more of
    // the thread is rendered without scrolling)
    const deviceScaleFactor = 0.2
    const height = Math.floor(2000 / deviceScaleFactor)
    const width = Math.floor(1700 / deviceScaleFactor)
    await page.setViewport({ width, height, deviceScaleFactor })
    await page.goto(pageURL, {
      waitUntil: 'networkidle0',
      timeout: 60000, // 60 seconds
    })
    // Everything below runs inside the page; only `author` is passed in.
    return await page.evaluate(async (author) => {
      /**
       * Wait for `ms` amount of milliseconds
       * @param {number} ms
       */
      const waitFor = (ms: number) =>
        new Promise((resolve) => setTimeout(resolve, ms))
      const ids = []
      // Find the first Show thread button and click it
      const showRepliesButton = Array.from(
        document.querySelectorAll('div[dir]')
      )
        .filter(
          (node) => node.children[0] && node.children[0].tagName === 'SPAN'
        )
        .find((node) => node.children[0].innerHTML === 'Show replies')
      if (showRepliesButton) {
        ;(showRepliesButton as HTMLElement).click()
        // give the newly requested replies time to render
        await waitFor(2000)
      }
      // Each tweet's permalink wraps its <time> element.
      const timeNodes = Array.from(document.querySelectorAll('time'))
      for (const timeNode of timeNodes) {
        /** @type {HTMLAnchorElement | HTMLSpanElement} */
        const timeContainerAnchor: HTMLAnchorElement | HTMLSpanElement | null =
          timeNode.parentElement
        if (!timeContainerAnchor) continue
        if (timeContainerAnchor.tagName === 'SPAN') continue
        const href = timeContainerAnchor.getAttribute('href')
        if (!href) continue
        // Get the tweet id and username from the href: https://twitter.com/username/status/1234567890
        const match = href.match(/\/([^/]+)\/status\/(\d+)/)
        if (!match) continue
        const id = match[2]
        const username = match[1]
        // skip non-author replies
        username === author && ids.push(id)
      }
      return ids
    }, author)
  } catch (error) {
    console.error('Error getting tweets', error)
    return []
  } finally {
    if (context) {
      await context.close()
    }
  }
}
export class TwitterHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Twitter'
  }

  shouldPreHandle(url: string): boolean {
    // Only handle twitter status URLs, and only when an API token is set.
    return !!TWITTER_BEARER_TOKEN && TWITTER_URL_MATCH.test(url.toString())
  }

  /**
   * Render a tweet (or, for replies, its whole thread) as an HTML page
   * with OpenGraph metadata. Replies by other users are excluded; the
   * author's tweets are scraped with puppeteer so threads older than the
   * 7-day search window are still unrolled.
   *
   * @throws when the URL has no tweet id or the author cannot be resolved
   */
  async preHandle(url: string, browser: Browser): Promise<PreHandleResult> {
    const tweetId = tweetIdFromStatusUrl(url)
    if (!tweetId) {
      throw new Error('could not find tweet id in url')
    }
    let tweet = await getTweetById(tweetId)
    const conversationId = tweet.data.conversation_id
    if (conversationId !== tweetId) {
      // this is a reply, so we need to get the root tweet of the thread
      tweet = await getTweetById(conversationId)
    }
    const tweetData = tweet.data
    const authorId = tweetData.author_id
    // BUG FIX: the original used `filter((u) => (u.id = authorId))` —
    // an assignment, not a comparison — which overwrote every user's id
    // and picked the first user regardless of authorship.
    const author = tweet.includes.users.find((u) => u.id === authorId)
    if (!author) {
      throw new Error('could not find tweet author')
    }
    // escape html entities in title
    const title = titleForTweet(author, tweetData.text)
    const escapedTitle = _.escape(title)
    const authorImage = author.profile_image_url.replace('_normal', '_400x400')
    const description = _.escape(tweetData.text)
    // use puppeteer to get all tweet replies in the thread
    const tweets = await getOldTweets(browser, conversationId, author.username)
    let tweetsContent = ''
    for (const tweet of tweets) {
      const tweetData = tweet.data
      let text = tweetData.text
      if (tweetData.entities && tweetData.entities.urls) {
        // replace t.co short links with readable expanded anchors
        for (const urlObj of tweetData.entities.urls) {
          text = text.replace(
            urlObj.url,
            `<a href="${urlObj.expanded_url}">${urlObj.display_url}</a>`
          )
        }
      }
      const includesHtml =
        tweet.includes.media
          ?.map((m) => {
            // photos link to themselves; other media link back to the tweet
            const linkUrl = m.type == 'photo' ? m.url : url
            const previewUrl = m.type == 'photo' ? m.url : m.preview_image_url
            return `<a class="media-link" href=${linkUrl}>
          <picture>
            <img class="tweet-img" src=${previewUrl} />
          </picture>
          </a>`
          })
          .join('\n') ?? ''
      tweetsContent += `
      <p>${text}</p>
      ${includesHtml}
    `
    }
    const tweetUrl = `
       — <a href="https://twitter.com/${author.username}">${
      author.username
    }</a> <span itemscope itemtype="https://schema.org/Person" itemprop="author">${
      author.name
    }</span> <a href="${url}">${formatTimestamp(tweetData.created_at)}</a>
    `
    const content = `
        <html>
            <head>
              <meta property="og:image" content="${authorImage}" />
              <meta property="og:image:secure_url" content="${authorImage}" />
              <meta property="og:title" content="${escapedTitle}" />
              <meta property="og:description" content="${description}" />
              <meta property="article:published_time" content="${tweetData.created_at}" />
              <meta property="og:site_name" content="Twitter" />
              <meta property="og:type" content="tweet" />
            </head>
            <body>
              <div>
                ${tweetsContent}
                ${tweetUrl}
              </div>
            </body>
        </html>`
    return { content, url, title }
  }
}

View File

@@ -0,0 +1,48 @@
import { DateTime } from 'luxon'
import { ContentHandler } from '../content-handler'
export class WeixinQqHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Weixin QQ'
  }

  shouldPreParse(url: string, dom: Document): boolean {
    return new URL(url).hostname.endsWith('weixin.qq.com')
  }

  /**
   * Cleans up a WeChat (Weixin) article page before parsing: records the
   * publish time as a `<meta name="date">` tag, renames the article-info
   * block so the parser preserves it, and strips title/profile/footer chrome.
   */
  async preParse(url: string, dom: Document): Promise<Document> {
    // Retrieve the publish time (page renders it in UTC+8)
    const publishTime = dom.querySelector('#publish_time')?.textContent
    if (publishTime) {
      const dateTimeFormat = 'yyyy-LL-dd HH:mm'
      // published time is in UTC+8
      const publishTimeISO = DateTime.fromFormat(publishTime, dateTimeFormat, {
        zone: 'Asia/Shanghai',
      }).toISO()
      // FIX: toISO() returns null when the text does not match the expected
      // format; only emit the meta tag for a valid timestamp instead of
      // calling setAttribute with null.
      if (publishTimeISO) {
        // create a meta node to store the publish time in ISO format
        const metaNode = dom.createElement('meta')
        metaNode.setAttribute('name', 'date')
        metaNode.setAttribute('content', publishTimeISO)
        dom.querySelector('head')?.appendChild(metaNode)
      }
    }
    // This replace the class name of the article info to preserve the block
    dom
      .querySelector('.rich_media_meta_list')
      ?.setAttribute('class', '_omnivore_rich_media_meta_list')
    // This removes the title
    dom.querySelector('.rich_media_title')?.remove()
    // This removes the profile info
    dom.querySelector('.profile_container')?.remove()
    // This removes the footer
    dom.querySelector('#content_bottom_area')?.remove()
    dom.querySelector('.rich_media_area_extra')?.remove()
    dom.querySelector('#js_pc_qr_code')?.remove()
    return Promise.resolve(dom)
  }
}

View File

@@ -0,0 +1,24 @@
import { ContentHandler } from '../content-handler'
export class WikipediaHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'wikipedia'
  }

  shouldPreParse(url: string, dom: Document): boolean {
    const { hostname } = new URL(url)
    return hostname.endsWith('wikipedia.org')
  }

  /**
   * Strips Wikipedia chrome before parsing: "[edit]" section links,
   * footnote reference markers, and the infobox sidebar.
   */
  async preParse(url: string, dom: Document): Promise<Document> {
    // Drop the [edit] anchors that follow every section heading
    for (const editLink of Array.from(
      dom.querySelectorAll('.mw-editsection')
    )) {
      editLink.remove()
    }
    // Drop footnote reference markers
    for (const footnote of Array.from(
      dom.querySelectorAll('sup[class="reference"]')
    )) {
      footnote.remove()
    }
    // Drop the infobox sidebar
    dom.querySelector('.infobox')?.remove()
    return Promise.resolve(dom)
  }
}

View File

@@ -0,0 +1,59 @@
import axios from 'axios'
import { parseHTML } from 'linkedom'
import { ContentHandler, PreHandleResult } from '../content-handler'
export class WiredHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Wired'
  }

  // A paywalled article page carries at least one element with the
  // "paywall" class; non-paywalled pages are left to the normal pipeline.
  isPaywalledContent(document: Document): boolean {
    return document.getElementsByClassName('paywall').length > 0
  }

  // Strips generic callouts, ad slots (via their parent containers) and
  // "most popular" widgets from the page, returning the mutated document.
  removeNonArticleNodes(document: Document): Document {
    document
      .querySelectorAll('[data-testid="GenericCallout"]')
      .forEach((node) => node.remove())
    document
      .querySelectorAll('.ad__slot')
      .forEach((node) => node.parentElement?.remove())
    document
      .querySelectorAll('[data-most-popular-id]')
      .forEach((node) => node.remove())
    return document
  }

  shouldPreHandle(url: string): boolean {
    return new URL(url).hostname.endsWith('wired.com')
  }

  async preHandle(url: string): Promise<PreHandleResult> {
    const response = await axios.get(url)
    const dom = parseHTML(response.data as string).document
    if (!this.isPaywalledContent(dom)) {
      // Not paywalled: return nothing so currently-working articles keep
      // flowing through the default parsing path.
      return {}
    }
    const cleanedArticleDom = this.removeNonArticleNodes(dom)
    return {
      content: cleanedArticleDom.body.outerHTML,
      title: dom.title,
      dom: cleanedArticleDom,
    }
  }
}

View File

@@ -0,0 +1,97 @@
import { ContentHandler, PreHandleResult } from '../content-handler'
import axios from 'axios'
import _ from 'underscore'
// Matches youtube.com watch/embed/v and youtu.be URLs; capture group 5 is
// the video id. FIX: the dot in "youtu.be" is now escaped — previously it
// acted as a wildcard, so bogus hosts like "youtuxbe" also matched.
const YOUTUBE_URL_MATCH =
  /^((?:https?:)?\/\/)?((?:www|m)\.)?((?:youtube\.com|youtu\.be))(\/(?:[\w-]+\?v=|embed\/|v\/)?)([\w-]+)(\S+)?$/

/**
 * Extracts the video id from a YouTube URL.
 * Prefers the `v` query parameter; falls back to path-based formats
 * (youtu.be short links, /embed/, /v/).
 *
 * @param url absolute YouTube URL (must be parseable by `new URL`)
 * @returns the video id, or undefined when none can be found
 */
export const getYoutubeVideoId = (url: string) => {
  const videoId = new URL(url).searchParams.get('v')
  if (videoId) {
    return videoId
  }
  const match = url.toString().match(YOUTUBE_URL_MATCH)
  if (match === null || match.length < 6 || !match[5]) {
    return undefined
  }
  return match[5]
}
/**
 * Extracts the playlist id (`list` query parameter) from a YouTube URL,
 * or null when the URL carries no playlist.
 */
export const getYoutubePlaylistId = (url: string) => {
  const { searchParams } = new URL(url)
  return searchParams.get('list')
}
// Escape HTML entities in a title so it is safe to interpolate into markup.
export const escapeTitle = (title: string): string => _.escape(title)
export class YoutubeHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Youtube'
  }

  shouldPreHandle(url: string): boolean {
    return YOUTUBE_URL_MATCH.test(url.toString())
  }

  /**
   * Builds a standalone HTML page embedding the YouTube video or playlist,
   * using the oEmbed endpoint for the title, thumbnail, dimensions and author.
   * Returns an empty result when the URL holds neither a playlist nor a
   * recognizable video id.
   */
  async preHandle(url: string): Promise<PreHandleResult> {
    // FIX: camelCase local (was `BaseUrl`) per naming convention
    const baseUrl = 'https://www.youtube.com'
    const embedBaseUrl = 'https://www.youtube.com/embed'
    let urlToEncode: string
    let src: string
    // Playlists take precedence over single videos
    const playlistId = getYoutubePlaylistId(url)
    if (playlistId) {
      urlToEncode = `${baseUrl}/playlist?list=${playlistId}`
      src = `${embedBaseUrl}/videoseries?list=${playlistId}`
    } else {
      const videoId = getYoutubeVideoId(url)
      if (!videoId) {
        return {}
      }
      urlToEncode = `${baseUrl}/watch?v=${videoId}`
      src = `${embedBaseUrl}/${videoId}`
    }
    const oembedUrl =
      `https://www.youtube.com/oembed?format=json&url=` +
      encodeURIComponent(urlToEncode)
    // FIX: oembedUrl is already a string; dropped the redundant .toString()
    const oembed = (await axios.get(oembedUrl)).data as {
      title: string
      width: number
      height: number
      thumbnail_url: string
      author_name: string
      author_url: string
    }
    // escape html entities in title
    const title = oembed.title
    const escapedTitle = escapeTitle(title)
    // scale the iframe to a fixed height, keeping the reported aspect ratio
    const ratio = oembed.width / oembed.height
    const thumbnail = oembed.thumbnail_url
    const height = 350
    const width = height * ratio
    const authorName = _.escape(oembed.author_name)
    const content = `
  <html>
    <head><title>${escapedTitle}</title>
    <meta property="og:image" content="${thumbnail}" />
    <meta property="og:image:secure_url" content="${thumbnail}" />
    <meta property="og:title" content="${escapedTitle}" />
    <meta property="og:description" content="" />
    <meta property="og:article:author" content="${authorName}" />
    <meta property="og:site_name" content="YouTube" />
    <meta property="og:type" content="video" />
  </head>
    <body>
    <iframe width="${width}" height="${height}" src="${src}" title="${escapedTitle}" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
    <p><a href="${url}" target="_blank">${escapedTitle}</a></p>
    <p itemscope="" itemprop="author" itemtype="http://schema.org/Person">By <a href="${oembed.author_url}" target="_blank">${authorName}</a></p>
  </body>
</html>`
    return { content, title }
  }
}

View File

@@ -0,0 +1,117 @@
import { ContentHandler } from '../content-handler'
/**
 * Pre-parse handler for zhihu.com (Chinese Q&A site).
 * preParse rewrites a question page so that only the question header and the
 * answer items remain in <body>, with classes renamed to `_omnivore_*` so the
 * downstream parser preserves those blocks.
 */
export class ZhihuHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'zhihu'
  }
  // Builds a <div> holding the question header plus its upvote count.
  // NOTE(review): not called from preParse below — confirm whether this is
  // used elsewhere or is dead code.
  parseQuestion(element: Element) {
    const newQuestion = element.ownerDocument.createElement('div')
    const question = element.querySelector(`.QuestionHeader-main`)
    if (question) {
      // upvote count is exposed via the data-value attribute
      const votes = element
        .querySelector(`div[itemprop='upvoteCount']`)
        ?.getAttribute('data-value')
      if (votes) {
        // "问题" = "Question"; pluralize "vote" unless count is exactly '1'
        newQuestion.innerHTML = `<h2>问题: ${votes} vote${
          votes === '1' ? '' : 's'
        }</h2>${question.innerHTML}`
      }
    }
    return newQuestion
  }
  // Builds a <div> of comments as "<author>: <text> - <date>" paragraphs.
  // NOTE(review): not called from preParse, and the selectors
  // (.comment-user, .comment-copy, .relativetime-clean) match Stack
  // Overflow-style markup rather than Zhihu — likely carried over from
  // another handler; verify before relying on it.
  parseComments(element: Element) {
    const dom = element.ownerDocument
    const newComments = dom.createElement('div')
    // comments
    const commentsDiv = element.querySelector(`.comments`)
    if (commentsDiv) {
      const comments = commentsDiv.querySelectorAll(`.comment`)
      if (comments.length > 0) {
        newComments.innerHTML = `<h3>Comments</h3>`
        comments.forEach((comment) => {
          const author = comment.querySelector(`.comment-user`)
          const text = comment.querySelector(`.comment-copy`)?.textContent
          const authorHref = author?.getAttribute('href')
          const date = comment.querySelector(`.relativetime-clean`)?.textContent
          // only render comments where every field could be extracted
          if (author && text && authorHref && date) {
            const newComment = dom.createElement('p')
            newComment.innerHTML = `<a href="${authorHref}"><b>${author.innerHTML}</b></a>: ${text} - ${date}`
            newComments.appendChild(newComment)
          }
        })
      }
    }
    return newComments
  }
  // Builds a <div> summarizing post authors (name, reputation, badges, date).
  // NOTE(review): not called from preParse, and the selectors
  // (.post-signature, .reputation-score, span[title*='badges']) also match
  // Stack Overflow-style markup, not Zhihu — verify before relying on it.
  parseAuthors(element: Element) {
    const dom = element.ownerDocument
    const newAuthors = dom.createElement('div')
    const authors = element.querySelectorAll(`.post-signature`)
    authors.forEach((author) => {
      // the post owner gets an extra rel="author" span appended below
      const isOwner = author.classList.contains('owner')
      const name = author.querySelector(`.user-details a`)?.textContent
      const link = author.querySelector(`.user-details a`)?.getAttribute('href')
      const reputation = author.querySelector(`.reputation-score`)?.textContent
      const badges = Array.from(
        author.querySelectorAll(`span[title*='badges']`)
      )
        .map((badge) => badge.getAttribute('title'))
        .join(', ')
      const date = author.querySelector(`.user-action-time`)?.textContent
      if (name && link && reputation && date) {
        const newAuthor = dom.createElement('p')
        newAuthor.innerHTML = `<a href="${link}"><b>${name}</b></a> - ${reputation} reputation - ${
          badges || 'no badge'
        } - ${date}`
        if (isOwner) {
          const author = dom.createElement('span')
          author.setAttribute('rel', 'author')
          author.innerHTML = name
          newAuthor.appendChild(author)
        }
        newAuthors.appendChild(newAuthor)
      }
    })
    return newAuthors
  }
  shouldPreParse(url: string, dom: Document): boolean {
    return new URL(url).hostname.endsWith('zhihu.com')
  }
  // Replaces <body> with just the question header and answer items,
  // renaming their classes so the parser keeps them.
  async preParse(url: string, dom: Document): Promise<Document> {
    const mainEntity = dom.querySelector(`div[itemprop='mainEntity']`)
    if (mainEntity) {
      const newMainEntity = dom.createElement('div')
      const question = mainEntity.querySelector('.QuestionHeader')
      if (question) {
        question.className = '_omnivore_zhihu_question'
        newMainEntity.appendChild(question)
      }
      const answers = mainEntity.querySelectorAll('.ContentItem.AnswerItem')
      answers.forEach((answer) => {
        answer
          .querySelector('.AuthorInfo')
          ?.setAttribute('class', '_omnivore_zhihu_author')
        answer.className = '_omnivore_zhihu_answer'
        newMainEntity.appendChild(answer)
      })
      // appendChild moves the nodes, so replacing body keeps only the rebuilt
      // question + answers
      dom.body.replaceChildren(newMainEntity)
    }
    return Promise.resolve(dom)
  }
}