Add files for building the OpenSSL image
examples/omnivore/api/content-handler/src/content-handler.ts (new file, 189 lines)
@@ -0,0 +1,189 @@
import addressparser from 'addressparser'
import axios from 'axios'
import { parseHTML } from 'linkedom'
import { Browser } from 'puppeteer-core'
import { v4 as uuid } from 'uuid'

interface Unsubscribe {
  mailTo?: string
  httpUrl?: string
}

export interface NewsletterInput {
  from: string
  to: string
  subject: string
  html: string
  headers: Record<string, string | string[]>
}

export interface NewsletterResult {
  email: string
  content: string
  url: string
  title: string
  author: string
  unsubMailTo?: string
  unsubHttpUrl?: string
}

export interface PreHandleResult {
  url?: string
  title?: string
  content?: string
  contentType?: string
  dom?: Document
}

export const FAKE_URL_PREFIX = 'https://omnivore.app/no_url?q='
export const generateUniqueUrl = () => FAKE_URL_PREFIX + uuid()

export abstract class ContentHandler {
  protected senderRegex: RegExp
  protected urlRegex: RegExp
  name: string

  protected constructor() {
    this.senderRegex = new RegExp(/NEWSLETTER_SENDER_REGEX/)
    this.urlRegex = new RegExp(/NEWSLETTER_URL_REGEX/)
    this.name = 'Handler name'
  }

  shouldResolve(url: string): boolean {
    return false
  }

  async resolve(url: string): Promise<string | undefined> {
    return Promise.resolve(url)
  }

  shouldPreHandle(url: string): boolean {
    return false
  }

  async preHandle(url: string, browser?: Browser): Promise<PreHandleResult> {
    return Promise.resolve({ url })
  }

  shouldPreParse(url: string, dom: Document): boolean {
    return false
  }

  async preParse(url: string, dom: Document): Promise<Document> {
    return Promise.resolve(dom)
  }

  async isNewsletter(input: {
    from: string
    html: string
    headers: Record<string, string | string[]>
    dom: Document
  }): Promise<boolean> {
    const re = new RegExp(this.senderRegex)
    const postHeader = input.headers['list-post']
    const unSubHeader = input.headers['list-unsubscribe']
    return Promise.resolve(
      re.test(input.from) && (!!postHeader || !!unSubHeader)
    )
  }

  findNewsletterHeaderHref(dom: Document): string | undefined {
    return undefined
  }

  // Given an HTML blob, tries to find a URL to use as
  // the canonical URL.
  async findNewsletterUrl(html: string): Promise<string | undefined> {
    const dom = parseHTML(html).document

    // Check if this is a substack newsletter
    const href = this.findNewsletterHeaderHref(dom)
    if (href) {
      // Try to make a HEAD request, so we get the redirected URL, since these
      // will usually be behind tracking url redirects
      try {
        const response = await axios.head(href, { timeout: 5000 })
        return Promise.resolve(
          // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access
          response.request.res.responseUrl as string | undefined
        )
      } catch (e) {
        console.log('error making HEAD request', e)
        return Promise.resolve(href)
      }
    }

    return Promise.resolve(undefined)
  }

  async parseNewsletterUrl(
    headers: Record<string, string | string[]>,
    html: string
  ): Promise<string | undefined> {
    // get url from dom
    const url = await this.findNewsletterUrl(html)
    if (url) {
      return url
    }
    // get newsletter url from html
    const matches = html.match(this.urlRegex)
    if (matches) {
      return matches[1]
    }
    return undefined
  }

  parseAuthor(from: string): string {
    // get author name from email
    // e.g. 'Jackson Harper from Omnivore App <jacksonh@substack.com>'
    // or 'Mike Allen <mike@axios.com>'
    const parsed = addressparser(from)
    if (parsed.length > 0 && parsed[0].name) {
      return parsed[0].name
    }
    return from
  }

  parseUnsubscribe(unSubHeader: string): Unsubscribe {
    // parse list-unsubscribe header
    // e.g. List-Unsubscribe: <https://omnivore.com/unsub>, <mailto:unsub@omnivore.com>
    return {
      httpUrl: unSubHeader.match(/<(https?:\/\/[^>]*)>/)?.[1],
      mailTo: unSubHeader.match(/<mailto:([^>]*)>/)?.[1],
    }
  }

  async handleNewsletter({
    from,
    to,
    subject,
    html,
    headers,
  }: NewsletterInput): Promise<NewsletterResult> {
    console.log('handleNewsletter', from, to, subject, headers)

    if (!from || !html || !subject || !to) {
      console.log('invalid newsletter email')
      throw new Error('invalid newsletter email')
    }

    // fallback to default url if newsletter url does not exist
    // assign a random uuid to the default url to avoid duplicate url
    const url =
      (await this.parseNewsletterUrl(headers, html)) || generateUniqueUrl()
    const author = this.parseAuthor(from)
    const unsubscribe = headers['list-unsubscribe']
      ? this.parseUnsubscribe(headers['list-unsubscribe'].toString())
      : undefined

    return {
      email: to,
      content: html,
      url,
      title: subject,
      author,
      unsubMailTo: unsubscribe?.mailTo || '',
      unsubHttpUrl: unsubscribe?.httpUrl || '',
    }
  }
}
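For orientation only, and not part of this commit: a minimal sketch of how a site-specific handler builds on this base class, following the pattern the handlers below use. The handler name, sender address, and CSS selector are hypothetical placeholders.

import { ContentHandler } from './content-handler'

// Hypothetical handler: 'example', example.com, and '.footer' are illustrative only.
export class ExampleHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'example'
    this.senderRegex = /<.+@example\.com>/
    this.urlRegex = /<a href=["']([^"']*)["'].*>View online<\/a>/
  }

  shouldPreParse(url: string, dom: Document): boolean {
    // claim pages served from the hypothetical example.com domain
    return new URL(url).hostname.endsWith('example.com')
  }

  async preParse(url: string, dom: Document): Promise<Document> {
    // strip chrome the reader does not need before the generic parse runs
    dom.querySelector('.footer')?.remove()
    return Promise.resolve(dom)
  }
}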
examples/omnivore/api/content-handler/src/index.ts (new file, 186 lines)
@@ -0,0 +1,186 @@
import { parseHTML } from 'linkedom'
import { Browser } from 'puppeteer-core'
import {
  ContentHandler,
  NewsletterInput,
  NewsletterResult,
  PreHandleResult,
} from './content-handler'
import { AxiosHandler } from './newsletters/axios-handler'
import { BeehiivHandler } from './newsletters/beehiiv-handler'
import { BloombergNewsletterHandler } from './newsletters/bloomberg-newsletter-handler'
import { ConvertkitHandler } from './newsletters/convertkit-handler'
import { CooperPressHandler } from './newsletters/cooper-press-handler'
import { EnergyWorldHandler } from './newsletters/energy-world'
import { EveryIoHandler } from './newsletters/every-io-handler'
import { GenericHandler } from './newsletters/generic-handler'
import { GhostHandler } from './newsletters/ghost-handler'
import { GolangHandler } from './newsletters/golang-handler'
import { HeyWorldHandler } from './newsletters/hey-world-handler'
import { IndiaTimesHandler } from './newsletters/india-times-handler'
import { MorningBrewHandler } from './newsletters/morning-brew-handler'
import { RevueHandler } from './newsletters/revue-handler'
import { SubstackHandler } from './newsletters/substack-handler'
import { AppleNewsHandler } from './websites/apple-news-handler'
import { ArsTechnicaHandler } from './websites/ars-technica-handler'
import { BloombergHandler } from './websites/bloomberg-handler'
import { DerstandardHandler } from './websites/derstandard-handler'
import { GitHubHandler } from './websites/github-handler'
import { ImageHandler } from './websites/image-handler'
import { MediumHandler } from './websites/medium-handler'
import { NitterHandler } from './websites/nitter-handler'
import { PdfHandler } from './websites/pdf-handler'
import { PipedVideoHandler } from './websites/piped-video-handler'
import { ScrapingBeeHandler } from './websites/scrapingBee-handler'
import { StackOverflowHandler } from './websites/stack-overflow-handler'
import { TDotCoHandler } from './websites/t-dot-co-handler'
import { TheAtlanticHandler } from './websites/the-atlantic-handler'
import { WeixinQqHandler } from './websites/weixin-qq-handler'
import { WikipediaHandler } from './websites/wikipedia-handler'
import { YoutubeHandler } from './websites/youtube-handler'
import { ZhihuHandler } from './websites/zhihu-handler'

const validateUrlString = (url: string): boolean => {
  const u = new URL(url)
  // Make sure the URL is http or https
  if (u.protocol !== 'http:' && u.protocol !== 'https:') {
    throw new Error('Invalid URL protocol check failed')
  }
  // Make sure the domain is not localhost
  if (u.hostname === 'localhost' || u.hostname === '0.0.0.0') {
    throw new Error('Invalid URL is localhost')
  }
  // Make sure the domain is not a private IP
  if (/^(10|172\.16|192\.168)\..*/.test(u.hostname)) {
    throw new Error('Invalid URL is private ip')
  }

  return true
}

const contentHandlers: ContentHandler[] = [
  new ArsTechnicaHandler(),
  new TheAtlanticHandler(),
  new AppleNewsHandler(),
  new BloombergHandler(),
  new DerstandardHandler(),
  new ImageHandler(),
  new MediumHandler(),
  new PdfHandler(),
  new ScrapingBeeHandler(),
  new TDotCoHandler(),
  new YoutubeHandler(),
  new WikipediaHandler(),
  new GitHubHandler(),
  new AxiosHandler(),
  new GolangHandler(),
  new MorningBrewHandler(),
  new BloombergNewsletterHandler(),
  new SubstackHandler(),
  new StackOverflowHandler(),
  new EnergyWorldHandler(),
  new PipedVideoHandler(),
  new WeixinQqHandler(),
  new NitterHandler(),
  new ZhihuHandler(),
]

const newsletterHandlers: ContentHandler[] = [
  new AxiosHandler(),
  new BloombergNewsletterHandler(),
  new GolangHandler(),
  new SubstackHandler(),
  new MorningBrewHandler(),
  new BeehiivHandler(),
  new ConvertkitHandler(),
  new RevueHandler(),
  new GhostHandler(),
  new CooperPressHandler(),
  new HeyWorldHandler(),
  new GenericHandler(),
  new EveryIoHandler(),
  new EnergyWorldHandler(),
  new IndiaTimesHandler(),
]

export const preHandleContent = async (
  url: string,
  browser: Browser
): Promise<PreHandleResult | undefined> => {
  // Before we run the regular handlers we check to see if we need to
  // pre-resolve the URL. TODO: This should probably happen recursively,
  // so URLs can be pre-resolved, handled, pre-resolved, handled, etc.
  for (const handler of contentHandlers) {
    if (handler.shouldResolve(url)) {
      try {
        const resolvedUrl = await handler.resolve(url)
        if (resolvedUrl && validateUrlString(resolvedUrl)) {
          url = resolvedUrl
        }
      } catch (err) {
        console.log('error resolving url with handler', handler.name, err)
      }
      break
    }
  }
  // Before we fetch the page we check the handlers, to see if they want
  // to perform a prefetch action that can modify our requests.
  // enumerate the handlers and see if any of them want to handle the request
  for (const handler of contentHandlers) {
    if (handler.shouldPreHandle(url)) {
      console.log('preHandleContent', handler.name, url)
      return handler.preHandle(url, browser)
    }
  }
  return undefined
}

export const preParseContent = async (
  url: string,
  dom: Document
): Promise<Document | undefined> => {
  // Before we parse the page we check the handlers, to see if they want
  // to perform a preParse action that can modify our dom.
  // enumerate the handlers and see if any of them want to handle the dom
  for (const handler of contentHandlers) {
    if (handler.shouldPreParse(url, dom)) {
      console.log('preParseContent', handler.name, url)
      return handler.preParse(url, dom)
    }
  }
  return undefined
}

export const getNewsletterHandler = async (input: {
  from: string
  html: string
  headers: Record<string, string | string[]>
}): Promise<ContentHandler | undefined> => {
  const dom = parseHTML(input.html).document
  for (const handler of newsletterHandlers) {
    if (await handler.isNewsletter({ ...input, dom })) {
      return handler
    }
  }

  return undefined
}

export const handleNewsletter = async (
  input: NewsletterInput
): Promise<NewsletterResult | undefined> => {
  const handler = await getNewsletterHandler(input)
  if (handler) {
    console.log('handleNewsletter', handler.name, input.subject)
    return handler.handleNewsletter(input)
  }

  return undefined
}

module.exports = {
  preHandleContent,
  handleNewsletter,
  preParseContent,
  getNewsletterHandler,
}
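A hedged usage sketch of the exported entry points, not part of this commit; the URL, email fields, and browser wiring below are placeholders:

import { Browser } from 'puppeteer-core'
import { handleNewsletter, preHandleContent } from './index'

const example = async (browser: Browser) => {
  // website path: a matching handler may resolve and/or pre-fetch the URL
  const preHandled = await preHandleContent('https://apple.news/ABCDEF', browser)
  console.log('pre-handled:', preHandled?.url, preHandled?.title)

  // email path: the first handler whose isNewsletter() matches normalizes the email
  const result = await handleNewsletter({
    from: 'Sender <sender@example.com>', // placeholder values
    to: 'user@example.com',
    subject: 'Hello world',
    html: '<html><body>...</body></html>',
    headers: { 'list-unsubscribe': '<https://example.com/unsub>' },
  })
  console.log('newsletter url:', result?.url)
}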
examples/omnivore/api/content-handler/src/newsletters/axios-handler.ts (new file, 46 lines)
@@ -0,0 +1,46 @@
import { ContentHandler } from '../content-handler'

export class AxiosHandler extends ContentHandler {
  constructor() {
    super()
    this.senderRegex = /<.+@axios.com>/
    this.urlRegex = /View in browser at <a.*>(.*)<\/a>/
    this.name = 'axios'
  }

  shouldPreParse(url: string, dom: Document): boolean {
    const host = this.name + '.com'
    // check if url ends with axios.com
    return new URL(url).hostname.endsWith(host)
  }

  async preParse(url: string, dom: Document): Promise<Document> {
    const body = dom.querySelector('table')

    let isFooter = false
    // this removes ads and replaces table with a div
    body?.querySelectorAll('table').forEach((el) => {
      // remove the footer and the ads
      if (!el.textContent || el.textContent.length < 20 || isFooter) {
        el.remove()
      } else {
        // removes the first few rows of the table (the header)
        // remove the last two rows of the table (they are ads)
        el.querySelectorAll('tr').forEach((tr, i) => {
          if (i <= 7 || i >= el.querySelectorAll('tr').length - 2) {
            console.log('removing', tr)
            tr.remove()
          }
        })
        // replace the table with a div
        const div = dom.createElement('div')
        div.innerHTML = el.innerHTML
        el.parentNode?.replaceChild(div, el)
        // set the isFooter flag to true because the next table is the footer
        isFooter = true
      }
    })

    return Promise.resolve(dom)
  }
}
examples/omnivore/api/content-handler/src/newsletters/beehiiv-handler.ts (new file, 24 lines)
@@ -0,0 +1,24 @@
import { ContentHandler } from '../content-handler'

export class BeehiivHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'beehiiv'
  }

  async isNewsletter(input: {
    from: string
    headers: Record<string, string | string[]>
  }): Promise<boolean> {
    return Promise.resolve(
      input.headers['x-beehiiv-type']?.toString() === 'newsletter'
    )
  }

  async parseNewsletterUrl(
    headers: Record<string, string | string[]>,
    html: string
  ): Promise<string | undefined> {
    return Promise.resolve(headers['x-newsletter']?.toString())
  }
}
examples/omnivore/api/content-handler/src/newsletters/bloomberg-newsletter-handler.ts (new file, 37 lines)
@@ -0,0 +1,37 @@
import { ContentHandler } from '../content-handler'

export class BloombergNewsletterHandler extends ContentHandler {
  constructor() {
    super()
    this.senderRegex = /<.+@mail.bloomberg.*.com>/
    this.urlRegex = /<a class="view-in-browser__url" href=["']([^"']*)["']/
    this.name = 'bloomberg'
  }

  shouldPreParse(url: string, dom: Document): boolean {
    const host = this.name + '.com'
    // check if url ends with bloomberg.com
    return (
      new URL(url).hostname.endsWith(host) ||
      dom.querySelector('.logo-image')?.getAttribute('alt')?.toLowerCase() ===
        this.name
    )
  }

  async preParse(url: string, dom: Document): Promise<Document> {
    const body = dom.querySelector('.wrapper')

    // this removes header
    body?.querySelector('.sailthru-variables')?.remove()
    body?.querySelector('.preview-text')?.remove()
    body?.querySelector('.logo-wrapper')?.remove()
    body?.querySelector('.by-the-number-wrapper')?.remove()
    // this removes footer
    body?.querySelector('.quote-box-wrapper')?.remove()
    body?.querySelector('.header-wrapper')?.remove()
    body?.querySelector('.component-wrapper')?.remove()
    body?.querySelector('.footer')?.remove()

    return Promise.resolve(dom)
  }
}
examples/omnivore/api/content-handler/src/newsletters/convertkit-handler.ts (new file, 52 lines)
@@ -0,0 +1,52 @@
import { ContentHandler } from '../content-handler'

export class ConvertkitHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'convertkit'
  }

  findNewsletterHeaderHref(dom: Document): string | undefined {
    const readOnline = dom.querySelectorAll('a')
    let res: string | undefined = undefined
    readOnline.forEach((e) => {
      if (
        e.textContent === 'View this email in your browser' ||
        e.textContent === 'Read on FS'
      ) {
        res = e.getAttribute('href') || undefined
      }
    })
    return res
  }

  async isNewsletter(input: {
    from: string
    dom: Document
    headers: Record<string, string | string[]>
  }): Promise<boolean> {
    const dom = input.dom
    const icons = dom.querySelectorAll(
      'img[src*="convertkit.com"], img[src*="convertkit-mail"]'
    )
    if (icons.length === 0) {
      return Promise.resolve(false)
    }
    // ignore newsletters that have a confirmation link to the newsletter in the body
    const links = dom.querySelectorAll(
      'a[href*="convertkit.com"], a[href*="convertkit-mail"]'
    )
    const isConfirmation = Array.from(links).some((e) => {
      return e.textContent === 'Confirm your subscription'
    })

    return Promise.resolve(!isConfirmation)
  }

  async parseNewsletterUrl(
    headers: Record<string, string | string[]>,
    html: string
  ): Promise<string | undefined> {
    return this.findNewsletterUrl(html)
  }
}
examples/omnivore/api/content-handler/src/newsletters/cooper-press-handler.ts (new file, 37 lines)
@@ -0,0 +1,37 @@
import { ContentHandler } from '../content-handler'

export class CooperPressHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'cooper-press'
  }

  findNewsletterHeaderHref(dom: Document): string | undefined {
    const readOnline = dom.querySelectorAll('a')
    let res: string | undefined = undefined
    readOnline.forEach((e) => {
      if (e.textContent === 'Read on the Web') {
        res = e.getAttribute('href') || undefined
      }
    })
    return res
  }

  async isNewsletter(input: {
    from: string
    dom: Document
    headers: Record<string, string | string[]>
  }): Promise<boolean> {
    const dom = input.dom
    return Promise.resolve(
      dom.querySelectorAll('a[href*="cooperpress.com"]').length > 0
    )
  }

  async parseNewsletterUrl(
    headers: Record<string, string | string[]>,
    html: string
  ): Promise<string | undefined> {
    return this.findNewsletterUrl(html)
  }
}
examples/omnivore/api/content-handler/src/newsletters/energy-world.ts (new file, 44 lines)
@@ -0,0 +1,44 @@
import { ContentHandler } from '../content-handler'

export class EnergyWorldHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Energy World'
  }

  async isNewsletter(input: {
    from: string
    html: string
    headers: Record<string, string | string[]>
    dom: Document
  }): Promise<boolean> {
    return Promise.resolve(
      input.from === 'ETEnergyworld Latest News<newsletter@etenergyworld.com>'
    )
  }

  shouldPreParse(url: string, dom: Document): boolean {
    return dom.querySelectorAll('img[src*="etenergyworld.png"]').length > 0
  }

  async preParse(url: string, dom: Document): Promise<Document> {
    // get the main content
    const main = dom.querySelector('table[class="nletter-wrap"]')
    if (!main) {
      return Promise.resolve(dom)
    }

    // create a new dom
    const newDom = dom.createDocumentFragment()

    // add the content to the new dom
    main.querySelectorAll('table[class="multi-cols"] tr').forEach((tr) => {
      const p = dom.createElement('p')
      p.innerHTML = tr.innerHTML
      newDom.appendChild(p)
    })
    dom.body.replaceChildren(newDom)

    return Promise.resolve(dom)
  }
}
examples/omnivore/api/content-handler/src/newsletters/every-io-handler.ts (new file, 22 lines)
@@ -0,0 +1,22 @@
import { ContentHandler } from '../content-handler'

export class EveryIoHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Every.io'
  }

  async isNewsletter(input: {
    from: string
    html: string
    headers: Record<string, string | string[]>
    dom: Document
  }): Promise<boolean> {
    return Promise.resolve(input.from === 'Every <hello@every.to>')
  }

  findNewsletterHeaderHref(dom: Document): string | undefined {
    const readOnline = dom.querySelector('.newsletter-email .title a')
    return readOnline?.getAttribute('href') || undefined
  }
}
examples/omnivore/api/content-handler/src/newsletters/generic-handler.ts (new file, 49 lines)
@@ -0,0 +1,49 @@
import { ContentHandler } from '../content-handler'
import addressparser from 'addressparser'

export class GenericHandler extends ContentHandler {
  // newsletter url text regex for newsletters that don't have a newsletter header
  NEWSLETTER_URL_TEXT_REGEX =
    /((View|Read)(.*)(email|post)?(.*)(in your browser|online|on (FS|the Web))|Lire en ligne)/i

  constructor() {
    super()
    this.name = 'Generic Newsletter'
  }

  async isNewsletter(input: {
    from: string
    html: string
    headers: Record<string, string | string[]>
    dom: Document
  }): Promise<boolean> {
    const postHeader = input.headers['list-post'] || input.headers['list-id']
    const unSubHeader = input.headers['list-unsubscribe']
    return Promise.resolve(!!postHeader || !!unSubHeader)
  }

  findNewsletterHeaderHref(dom: Document): string | undefined {
    const readOnline = dom.querySelectorAll('a')
    let res: string | undefined = undefined
    readOnline.forEach((e) => {
      if (e.textContent && this.NEWSLETTER_URL_TEXT_REGEX.test(e.textContent)) {
        res = e.getAttribute('href') || undefined
      }
    })
    return res
  }

  async parseNewsletterUrl(
    headers: Record<string, string | string[]>,
    html: string
  ): Promise<string | undefined> {
    // raw Substack newsletter url is like <https://hongbo130.substack.com/p/tldr>
    // we need to get the real url from the raw url
    const postHeader = headers['list-post']?.toString()
    if (postHeader && addressparser(postHeader).length > 0) {
      return addressparser(postHeader)[0].name
    }

    return this.findNewsletterUrl(html)
  }
}
examples/omnivore/api/content-handler/src/newsletters/ghost-handler.ts (new file, 31 lines)
@@ -0,0 +1,31 @@
import { ContentHandler } from '../content-handler'

export class GhostHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'ghost'
  }

  findNewsletterHeaderHref(dom: Document): string | undefined {
    const readOnline = dom.querySelector('.view-online-link')
    return readOnline?.getAttribute('href') || undefined
  }

  async isNewsletter(input: {
    from: string
    dom: Document
    headers: Record<string, string | string[]>
  }): Promise<boolean> {
    const dom = input.dom
    return Promise.resolve(
      dom.querySelectorAll('img[src*="ghost.org"]').length > 0
    )
  }

  async parseNewsletterUrl(
    headers: Record<string, string | string[]>,
    html: string
  ): Promise<string | undefined> {
    return this.findNewsletterUrl(html)
  }
}
examples/omnivore/api/content-handler/src/newsletters/golang-handler.ts (new file, 27 lines)
@@ -0,0 +1,27 @@
import { ContentHandler } from '../content-handler'

export class GolangHandler extends ContentHandler {
  constructor() {
    super()
    this.senderRegex = /<.+@golangweekly.com>/
    this.urlRegex = /<a href=["']([^"']*)["'].*>Read on the Web<\/a>/
    this.name = 'golangweekly'
  }

  shouldPreParse(url: string, dom: Document): boolean {
    const host = this.name + '.com'
    // check if url ends with golangweekly.com
    return new URL(url).hostname.endsWith(host)
  }

  async preParse(url: string, dom: Document): Promise<Document> {
    const body = dom.querySelector('body')

    // this removes the "Subscribe" button
    body?.querySelector('.el-splitbar')?.remove()
    // this removes the title
    body?.querySelector('.el-masthead')?.remove()

    return Promise.resolve(dom)
  }
}
examples/omnivore/api/content-handler/src/newsletters/hey-world-handler.ts (new file, 27 lines)
@@ -0,0 +1,27 @@
import { ContentHandler } from '../content-handler'

export class HeyWorldHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'hey-world'
    this.senderRegex = /<.+@world.hey.com>/
  }

  findNewsletterHeaderHref(dom: Document): string | undefined {
    const readOnline = dom.querySelectorAll('a')
    let res: string | undefined = undefined
    readOnline.forEach((e) => {
      if (e.textContent === 'View this post online') {
        res = e.getAttribute('href') || undefined
      }
    })
    return res
  }

  async parseNewsletterUrl(
    headers: Record<string, string | string[]>,
    html: string
  ): Promise<string | undefined> {
    return this.findNewsletterUrl(html)
  }
}
examples/omnivore/api/content-handler/src/newsletters/india-times-handler.ts (new file, 33 lines)
@@ -0,0 +1,33 @@
import { ContentHandler } from '../content-handler'
import addressparser from 'addressparser'

export class IndiaTimesHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'India Times'
  }

  async isNewsletter(input: {
    from: string
    html: string
    headers: Record<string, string | string[]>
    dom: Document
  }): Promise<boolean> {
    return Promise.resolve(
      addressparser(input.from).some(
        (e) => e.address === 'newsletters@timesofindia.com'
      )
    )
  }

  findNewsletterHeaderHref(dom: Document): string | undefined {
    const readOnline = dom.querySelectorAll('a')
    let res: string | undefined = undefined
    readOnline.forEach((e) => {
      if (e.textContent === 'view in browser') {
        res = e.getAttribute('href') || undefined
      }
    })
    return res
  }
}
examples/omnivore/api/content-handler/src/newsletters/morning-brew-handler.ts (new file, 35 lines)
@@ -0,0 +1,35 @@
import { ContentHandler } from '../content-handler'

export class MorningBrewHandler extends ContentHandler {
  constructor() {
    super()
    this.senderRegex = /Morning Brew <crew@morningbrew.com>/
    this.urlRegex = /<a.* href=["']([^"']*)["'].*>View Online<\/a>/
    this.name = 'morningbrew'
  }

  shouldPreParse(url: string, dom: Document): boolean {
    const host = this.name + '.com'
    // check if url ends with morningbrew.com
    return new URL(url).hostname.endsWith(host)
  }

  async preParse(url: string, dom: Document): Promise<Document> {
    // retain the width of the cells in the table of market info
    dom.querySelectorAll('.markets-arrow-cell').forEach((td) => {
      const table = td.closest('table')
      if (table) {
        const bubbleTable = table.querySelector('.markets-bubble')
        if (bubbleTable) {
          // replace the nested table with the text
          const e = bubbleTable.querySelector('.markets-table-text')
          e && bubbleTable.parentNode?.replaceChild(e, bubbleTable)
        }
        // set custom class for the table
        table.className = 'morning-brew-markets'
      }
    })

    return Promise.resolve(dom)
  }
}
examples/omnivore/api/content-handler/src/newsletters/revue-handler.ts (new file, 44 lines)
@@ -0,0 +1,44 @@
import { ContentHandler } from '../content-handler'

export class RevueHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'revue'
  }

  findNewsletterHeaderHref(dom: Document): string | undefined {
    const viewOnline = dom.querySelectorAll('table tr td a[target="_blank"]')
    let res: string | undefined = undefined
    viewOnline.forEach((e) => {
      if (e.textContent === 'View online') {
        res = e.getAttribute('href') || undefined
      }
    })
    return res
  }

  async isNewsletter(input: {
    from: string
    dom: Document
    headers: Record<string, string | string[]>
  }): Promise<boolean> {
    const dom = input.dom
    if (
      dom.querySelectorAll('img[src*="getrevue.co"], img[src*="revue.email"]')
        .length > 0
    ) {
      const getrevueUrl = this.findNewsletterHeaderHref(dom)
      if (getrevueUrl) {
        return Promise.resolve(true)
      }
    }
    return false
  }

  async parseNewsletterUrl(
    headers: Record<string, string | string[]>,
    html: string
  ): Promise<string | undefined> {
    return this.findNewsletterUrl(html)
  }
}
examples/omnivore/api/content-handler/src/newsletters/substack-handler.ts (new file, 139 lines)
@@ -0,0 +1,139 @@
import addressparser from 'addressparser'
import { ContentHandler } from '../content-handler'

export class SubstackHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'substack'
  }

  shouldPreParse(url: string, dom: Document): boolean {
    const host = this.name + '.com'
    const cdnHost = 'substackcdn.com'
    // check if url ends with substack.com
    // or has a profile image hosted at substack.com or substackcdn.com
    return (
      new URL(url).hostname.endsWith(host) ||
      !!dom
        .querySelector('.email-body img')
        ?.getAttribute('src')
        ?.includes(host || cdnHost)
    )
  }

  async preParse(url: string, dom: Document): Promise<Document> {
    const body = dom.querySelector('.email-body-container')

    // this removes header and profile avatar
    body?.querySelector('.header')?.remove()
    body?.querySelector('.preamble')?.remove()
    body?.querySelector('.meta-author-wrap')?.remove()
    // this removes meta button
    body?.querySelector('.post-meta')?.remove()
    // this removes footer
    body?.querySelector('.post-cta')?.remove()
    body?.querySelector('.container-border')?.remove()
    body?.querySelector('.footer')?.remove()
    // this removes the "restack" button
    body?.querySelector('.email-ufi-2-bottom')?.remove()
    // this removes the "share" button
    body?.querySelector('.email-ufi-2-top')?.remove()

    dom = this.fixupStaticTweets(dom)

    return Promise.resolve(dom)
  }

  findNewsletterHeaderHref(dom: Document): string | undefined {
    // Substack header links
    const postLink = dom.querySelector('h1 a')
    if (postLink) {
      return postLink.getAttribute('href') || undefined
    }

    return undefined
  }

  async isNewsletter({
    headers,
    dom,
  }: {
    from: string
    headers: Record<string, string | string[]>
    dom: Document
  }): Promise<boolean> {
    if (headers['list-post']) {
      return Promise.resolve(true)
    }
    // substack newsletter emails have tables with a *post-meta class
    if (dom.querySelector('table[class$="post-meta"]')) {
      return true
    }
    // If the article has a header link and Substack icons, it's probably a newsletter
    const href = this.findNewsletterHeaderHref(dom)
    const oldHeartIcon = dom.querySelector(
      'table tbody td span a img[src*="HeartIcon"]'
    )
    const oldRecommendIcon = dom.querySelector(
      'table tbody td span a img[src*="RecommendIconRounded"]'
    )
    const heartIcon = dom.querySelector('a img[src*="LucideHeart"]')
    const commentsIcon = dom.querySelector('a img[src*="LucideComments"]')
    return Promise.resolve(
      !!(
        href &&
        (oldHeartIcon || oldRecommendIcon || heartIcon || commentsIcon)
      )
    )
  }

  async parseNewsletterUrl(
    headers: Record<string, string | string[]>,
    html: string
  ): Promise<string | undefined> {
    // raw Substack newsletter url is like <https://hongbo130.substack.com/p/tldr>
    // we need to get the real url from the raw url
    const postHeader = headers['list-post']?.toString()
    if (postHeader && addressparser(postHeader).length > 0) {
      return Promise.resolve(addressparser(postHeader)[0].name)
    }
    return this.findNewsletterUrl(html)
  }

  fixupStaticTweets(dom: Document): Document {
    const preClassName = '_omnivore-static-'
    const staticTweets = dom.querySelectorAll('div[class="tweet static"]')

    if (staticTweets.length < 1) {
      return dom
    }

    const recurse = (node: Element, f: (node: Element) => void) => {
      for (let i = 0; i < node.children.length; i++) {
        const child = node.children[i]
        recurse(child, f)
        f(child)
      }
    }

    for (const tweet of Array.from(staticTweets)) {
      tweet.className = preClassName + 'tweet'
      tweet.removeAttribute('style')

      // get all children, rename their class, remove style
      // elements (style will be handled in the reader)
      recurse(tweet, (n: Element) => {
        const className = n.className
        if (
          className.startsWith('tweet-') ||
          className.startsWith('quote-tweet')
        ) {
          n.className = preClassName + className
        }
        n.removeAttribute('style')
      })
    }

    return dom
  }
}
examples/omnivore/api/content-handler/src/redis.ts (new file, 32 lines)
@@ -0,0 +1,32 @@
import { createClient } from 'redis'

// explicitly create the return type of RedisClient
export type RedisClient = ReturnType<typeof createClient>

export const createRedisClient = async (
  url?: string,
  cert?: string
): Promise<RedisClient> => {
  const redisClient = createClient({
    url,
    socket: {
      tls: url?.startsWith('rediss://'), // rediss:// is the protocol for TLS
      cert: cert?.replace(/\\n/g, '\n'), // replace \n with new line
      rejectUnauthorized: false, // for self-signed certs
      connectTimeout: 10000, // 10 seconds
      reconnectStrategy(retries: number): number | Error {
        if (retries > 10) {
          return new Error('Retries exhausted')
        }
        return 1000
      },
    },
  })

  redisClient.on('error', (err) => console.error('Redis Client Error', err))

  await redisClient.connect()
  console.log('Redis Client Connected:', url)

  return redisClient
}
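A hedged usage sketch, not part of this commit; the key and value are placeholders, the environment variable names follow the ones nitter-handler.ts uses, and the returned client is a standard node-redis v4 client:

import { createRedisClient } from './redis'

const example = async () => {
  const client = await createRedisClient(process.env.REDIS_URL, process.env.REDIS_CERT)
  await client.set('greeting', 'hello') // plain node-redis v4 commands from here on
  console.log(await client.get('greeting'))
  await client.quit()
}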
examples/omnivore/api/content-handler/src/websites/apple-news-handler.ts (new file, 31 lines)
@@ -0,0 +1,31 @@
import axios from 'axios'
import { parseHTML } from 'linkedom'
import { ContentHandler, PreHandleResult } from '../content-handler'

export class AppleNewsHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Apple News'
  }

  shouldPreHandle(url: string): boolean {
    const u = new URL(url)
    return u.hostname === 'apple.news'
  }

  async preHandle(url: string): Promise<PreHandleResult> {
    const MOBILE_USER_AGENT =
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
    const response = await axios.get(url, {
      headers: { 'User-Agent': MOBILE_USER_AGENT },
    })
    const data = response.data as string
    const dom = parseHTML(data).document
    // make sure it's a valid URL by wrapping in new URL
    const href = dom
      .querySelector('span.click-here')
      ?.parentElement?.getAttribute('href')
    const u = href ? new URL(href) : undefined
    return { url: u?.href }
  }
}
examples/omnivore/api/content-handler/src/websites/ars-technica-handler.ts (new file, 86 lines)
@@ -0,0 +1,86 @@
import axios from 'axios'
import { parseHTML } from 'linkedom'
import { ContentHandler, PreHandleResult } from '../content-handler'

/**
 * Some of the content on Ars Technica is split over several pages.
 * If this is the case we should unfurl the entire article into one.
 */
export class ArsTechnicaHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'ArsTechnica'
  }

  shouldPreHandle(url: string): boolean {
    const u = new URL(url)
    return u.hostname.endsWith('arstechnica.com')
  }

  hasMultiplePages(document: Document): boolean {
    return document.querySelectorAll('nav.page-numbers')?.length != 0
  }

  async grabContentFromUrl(url: string): Promise<Document> {
    const response = await axios.get(url)
    const data = response.data as string
    return parseHTML(data).document
  }

  async extractArticleContentsFromLink(url: string): Promise<Document[]> {
    const dom = await this.grabContentFromUrl(url)
    const articleContent = dom.querySelector('[itemprop="articleBody"]')
    return [].slice.call(articleContent?.childNodes || [])
  }

  async expandLinksAndCombine(document: Document): Promise<Document> {
    const pageNumbers = document.querySelector('nav.page-numbers')
    const articleBody = document.querySelector('[itemprop="articleBody"]')

    if (!pageNumbers || !articleBody) {
      // We shouldn't ever really get here, but sometimes weird things happen.
      return document
    }

    const pageLinkNodes = pageNumbers.querySelectorAll('a')
    // Remove the "Next" Link, as it will duplicate some content.
    const pageLinks =
      Array.from(pageLinkNodes)
        ?.slice(0, pageLinkNodes.length - 1)
        ?.map(({ href }) => href) ?? []

    const pageContents = await Promise.all(
      pageLinks.map(this.extractArticleContentsFromLink.bind(this))
    )

    for (const articleContents of pageContents) {
      // We place all the content in a span to indicate that a page has been parsed.
      const span = document.createElement('SPAN')
      span.className = 'nextPageContents'
      span.append(...articleContents)
      articleBody.append(span)
    }
    pageNumbers.remove()

    return document
  }

  async preHandle(url: string): Promise<PreHandleResult> {
    // We simply retrieve the article without Javascript enabled using a GET command.
    const dom = await this.grabContentFromUrl(url)
    if (!this.hasMultiplePages(dom)) {
      return {
        content: dom.body.outerHTML,
        title: dom.title,
        dom,
      }
    }

    const expandedDom = await this.expandLinksAndCombine(dom)
    return {
      content: expandedDom.body.outerHTML,
      title: dom.title,
      dom: expandedDom,
    }
  }
}
examples/omnivore/api/content-handler/src/websites/bloomberg-handler.ts (new file, 41 lines)
@@ -0,0 +1,41 @@
import axios from 'axios'
import { parseHTML } from 'linkedom'
import { ContentHandler, PreHandleResult } from '../content-handler'

export class BloombergHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Bloomberg'
  }

  shouldPreHandle(url: string): boolean {
    const BLOOMBERG_URL_MATCH =
      /https?:\/\/(www\.)?bloomberg.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_+.~#?&/=]*)/
    return BLOOMBERG_URL_MATCH.test(url.toString())
  }

  async preHandle(url: string): Promise<PreHandleResult> {
    console.log('prehandling bloomberg url', url)

    try {
      const response = await axios.get('https://app.scrapingbee.com/api/v1', {
        params: {
          api_key: process.env.SCRAPINGBEE_API_KEY,
          url: url,
          return_page_source: true,
          block_ads: true,
          block_resources: false,
        },
      })
      const dom = parseHTML(response.data).document
      return {
        title: dom.title,
        content: dom.querySelector('body')?.innerHTML,
        url: url,
      }
    } catch (error) {
      console.error('error prehandling bloomberg url', error)
      throw error
    }
  }
}
examples/omnivore/api/content-handler/src/websites/derstandard-handler.ts (new file, 34 lines)
@@ -0,0 +1,34 @@
import { ContentHandler, PreHandleResult } from '../content-handler'
import axios from 'axios'
import { parseHTML } from 'linkedom'

export class DerstandardHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Derstandard'
  }

  shouldPreHandle(url: string): boolean {
    const u = new URL(url)
    return u.hostname === 'www.derstandard.at'
  }

  async preHandle(url: string): Promise<PreHandleResult> {
    const response = await axios.get(url, {
      // set cookie to give consent to get the article
      headers: {
        cookie: `DSGVO_ZUSAGE_V1=true; consentUUID=2bacb9c1-1e80-4be0-9f7b-ee987cf4e7b0_6`,
      },
    })
    const content = response.data as string

    const dom = parseHTML(content).document
    const titleElement = dom.querySelector('.article-title')
    titleElement && titleElement.remove()

    return {
      content: dom.body.outerHTML,
      title: titleElement?.textContent || undefined,
    }
  }
}
examples/omnivore/api/content-handler/src/websites/github-handler.ts (new file, 44 lines)
@@ -0,0 +1,44 @@
import { ContentHandler } from '../content-handler'

export class GitHubHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'github'
  }

  shouldPreParse(url: string, dom: Document): boolean {
    return new URL(url).hostname.endsWith('github.com')
  }

  async preParse(url: string, dom: Document): Promise<Document> {
    const body = dom.querySelector('body')
    const article = dom.querySelector('article')
    const twitterTitle = dom.querySelector(`meta[name='twitter:title']`)
    const linkAuthor = dom.querySelector(`span[itemprop='author']`)

    if (body && article) {
      body.replaceChildren(article)

      // Attempt to set the author also. This is available on repo homepages
      // but not on things like PRs. Ideally we want PRs and issues to have
      // author set to the author of the PR/issue.
      if (linkAuthor && linkAuthor.textContent) {
        const author = dom.createElement('span')
        author.setAttribute('rel', 'author')
        author.innerHTML = linkAuthor.textContent
        article.appendChild(author)
      }
    }

    // Remove the GitHub - and repo org from the title
    const twitterTitleContent = twitterTitle?.getAttribute('content')
    if (twitterTitle && twitterTitleContent) {
      twitterTitle.setAttribute(
        'content',
        twitterTitleContent.replace(/GitHub - (.*?)\//, '')
      )
    }

    return Promise.resolve(dom)
  }
}
examples/omnivore/api/content-handler/src/websites/image-handler.ts (new file, 33 lines)
@@ -0,0 +1,33 @@
import { ContentHandler, PreHandleResult } from '../content-handler'

export class ImageHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Image'
  }

  shouldPreHandle(url: string): boolean {
    const IMAGE_URL_PATTERN = /(https?:\/\/.*\.(?:jpg|jpeg|png|webp))/i
    return IMAGE_URL_PATTERN.test(url.toString())
  }

  async preHandle(url: string): Promise<PreHandleResult> {
    const title = url.toString().split('/').pop() || 'Image'
    const content = `
      <html>
        <head>
          <title>${title}</title>
          <meta property="og:image" content="${url}" />
          <meta property="og:title" content="${title}" />
          <meta property="og:type" content="image" />
        </head>
        <body>
          <div>
            <img src="${url}" alt="${title}">
          </div>
        </body>
      </html>`

    return Promise.resolve({ title, content })
  }
}
examples/omnivore/api/content-handler/src/websites/medium-handler.ts (new file, 26 lines)
@@ -0,0 +1,26 @@
import { ContentHandler, PreHandleResult } from '../content-handler'

export class MediumHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Medium'
  }

  shouldPreHandle(url: string): boolean {
    const u = new URL(url)
    return u.hostname.endsWith('medium.com')
  }

  async preHandle(url: string): Promise<PreHandleResult> {
    console.log('prehandling medium url', url)

    try {
      const res = new URL(url)
      res.searchParams.delete('source')
      return Promise.resolve({ url: res.toString() })
    } catch (error) {
      console.error('error prehandling medium url', error)
      throw error
    }
  }
}
@@ -0,0 +1,417 @@
|
||||
import axios from 'axios'
|
||||
import { parseHTML } from 'linkedom'
|
||||
import _, { truncate } from 'lodash'
|
||||
import { DateTime } from 'luxon'
|
||||
import { ContentHandler, PreHandleResult } from '../content-handler'
|
||||
import { createRedisClient, RedisClient } from '../redis'
|
||||
|
||||
interface Tweet {
|
||||
url: string
|
||||
author: {
|
||||
username: string
|
||||
name: string
|
||||
profileImageUrl: string
|
||||
}
|
||||
text: string
|
||||
entities: {
|
||||
urls: {
|
||||
url: string
|
||||
displayUrl: string
|
||||
}[]
|
||||
}
|
||||
attachments: {
|
||||
type: string
|
||||
url: string
|
||||
previewUrl: string
|
||||
}[]
|
||||
createdAt: string
|
||||
}
|
||||
|
||||
export class NitterHandler extends ContentHandler {
|
||||
// matches twitter.com and nitter.net urls
|
||||
URL_MATCH =
|
||||
/((twitter\.com)|(nitter\.net))\/(?:#!\/)?(\w+)\/status(?:es)?\/(\d+)(?:\/.*)?/
|
||||
INSTANCES = [
|
||||
{ value: 'https://nitter.moomoo.me', score: 0 },
|
||||
{ value: 'https://nitter.net', score: 1 }, // the official instance
|
||||
{ value: 'https://nitter.lacontrevoie.fr', score: 2 },
|
||||
{ value: 'https://nitter.kavin.rocks', score: 3 },
|
||||
{ value: 'https://notabird.site', score: 4 },
|
||||
{ value: 'https://singapore.unofficialbird.com', score: 5 },
|
||||
{ value: 'https://nitter.fly.dev', score: 6 },
|
||||
]
|
||||
REDIS_KEY = 'nitter-instances'
|
||||
|
||||
private instance: string
|
||||
|
||||
constructor() {
|
||||
super()
|
||||
this.name = 'Nitter'
|
||||
this.instance = ''
|
||||
}
|
||||
|
||||
async getInstances(redisClient: RedisClient) {
|
||||
// get instances by score in ascending order
|
||||
const instances = await redisClient.zRange(this.REDIS_KEY, '-inf', '+inf', {
|
||||
BY: 'SCORE',
|
||||
})
|
||||
console.debug('instances', instances)
|
||||
|
||||
// if no instance is found, save the default instances
|
||||
if (instances.length === 0) {
|
||||
const result = await redisClient.zAdd(this.REDIS_KEY, this.INSTANCES, {
|
||||
NX: true, // only add if the key does not exist
|
||||
})
|
||||
console.debug('add instances', result)
|
||||
|
||||
// expire the key after 1 day
|
||||
const exp = await redisClient.expire(this.REDIS_KEY, 60 * 60 * 24)
|
||||
console.debug('instances expire in 1 day', exp)
|
||||
|
||||
return this.INSTANCES.map((i) => i.value)
|
||||
}
|
||||
|
||||
return instances
|
||||
}
|
||||
|
||||
async incrementInstanceScore(
|
||||
redisClient: RedisClient,
|
||||
instance: string,
|
||||
score = 1
|
||||
) {
|
||||
await redisClient.zIncrBy(this.REDIS_KEY, score, instance)
|
||||
}
|
||||
|
||||
async getTweets(username: string, tweetId: string) {
|
||||
function authorParser(header: Element) {
|
||||
const profileImageUrl =
|
||||
header.querySelector('.tweet-avatar img')?.getAttribute('src') ?? ''
|
||||
const name =
|
||||
header.querySelector('.fullname')?.getAttribute('title') ?? ''
|
||||
const username =
|
||||
header.querySelector('.username')?.getAttribute('title') ?? ''
|
||||
|
||||
return {
|
||||
profileImageUrl,
|
||||
name,
|
||||
username: username.replace('@', ''), // remove @ from username
|
||||
}
|
||||
}
|
||||
|
||||
function dateParser(date: Element) {
|
||||
const validDateTime =
|
||||
date.getAttribute('title')?.replace(' · ', ' ') ?? Date.now()
|
||||
|
||||
return new Date(validDateTime).toISOString()
|
||||
}
|
||||
|
||||
function urlParser(date: Element) {
|
||||
return date.getAttribute('href') ?? ''
|
||||
}
|
||||
|
||||
function attachmentParser(attachments: Element | null) {
|
||||
if (!attachments) return []
|
||||
|
||||
const photos = Array.from(attachments.querySelectorAll('img')).map(
|
||||
(i) => ({
|
||||
url: i.getAttribute('src') ?? '',
|
||||
type: 'photo',
|
||||
previewUrl: i.getAttribute('src') ?? '',
|
||||
})
|
||||
)
|
||||
const videos = Array.from(attachments.querySelectorAll('video')).map(
|
||||
(i) => ({
|
||||
url: i.getAttribute('data-url') ?? '',
|
||||
type: 'video',
|
||||
previewUrl: i.getAttribute('poster') ?? '',
|
||||
})
|
||||
)
|
||||
|
||||
return [...photos, ...videos]
|
||||
}
|
||||
|
||||
function parseTweet(tweet: Element): Tweet | null {
|
||||
const header = tweet.querySelector('.tweet-header')
|
||||
if (!header) {
|
||||
console.error('no header found', tweet)
|
||||
return null
|
||||
}
|
||||
const author = authorParser(header)
|
||||
|
||||
const body = tweet.querySelector('.tweet-body')
|
||||
if (!body) {
|
||||
console.error('no body found', tweet)
|
||||
return null
|
||||
}
|
||||
|
||||
const tweetDateElement = body.querySelector('.tweet-date a')
|
||||
if (!tweetDateElement) {
|
||||
console.error('no tweet date found', tweet)
|
||||
return null
|
||||
}
|
||||
const createdAt = dateParser(tweetDateElement)
|
||||
const url = urlParser(tweetDateElement)
|
||||
|
||||
const content = body.querySelector('.tweet-content')
|
||||
if (!content) {
|
||||
console.error('no content found', tweet)
|
||||
return null
|
||||
}
|
||||
const text = content.textContent ?? ''
|
||||
const urls = Array.from(content.querySelectorAll('a')).map((a) => ({
|
||||
url: a.getAttribute('href') ?? '',
|
||||
displayUrl: a.textContent ?? '',
|
||||
}))
|
||||
|
||||
const attachments = attachmentParser(body.querySelector('.attachments'))
|
||||
|
||||
return {
|
||||
author,
|
||||
createdAt,
|
||||
text,
|
||||
url,
|
||||
entities: {
|
||||
urls,
|
||||
},
|
||||
attachments,
|
||||
}
|
||||
}
|
||||
|
||||
const redisClient = await createRedisClient(
|
||||
process.env.REDIS_URL,
|
||||
process.env.REDIS_CERT
|
||||
)
|
||||
|
||||
try {
|
||||
const tweets: Tweet[] = []
|
||||
const option = {
|
||||
timeout: 20000, // 20 seconds
|
||||
}
|
||||
let html = ''
|
||||
// get instances from redis
|
||||
const instances = await this.getInstances(redisClient)
|
||||
for (const instance of instances) {
|
||||
try {
|
||||
const url = `${instance}/${username}/status/${tweetId}`
|
||||
const startTime = Date.now()
|
||||
const response = await axios.get(url, option)
|
||||
const latency = Math.floor(Date.now() - startTime)
|
||||
console.debug('latency', latency)
|
||||
|
||||
html = response.data as string
|
||||
this.instance = instance
|
||||
|
||||
await this.incrementInstanceScore(redisClient, instance, latency)
|
||||
break
|
||||
} catch (error) {
|
||||
await this.incrementInstanceScore(
|
||||
redisClient,
|
||||
instance,
|
||||
option.timeout
|
||||
)
|
||||
|
||||
if (axios.isAxiosError(error)) {
|
||||
console.info(`Error getting tweets from ${instance}`, error.message)
|
||||
} else {
|
||||
console.info(`Error getting tweets from ${instance}`, error)
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!this.instance || !html) {
|
||||
console.error('no instance or html found')
|
||||
return []
|
||||
}
|
||||
|
||||
const document = parseHTML(html).document
|
||||
|
||||
// get the main thread including tweets and threads
|
||||
const mainThread = document.querySelector('.main-thread')
|
||||
if (!mainThread) {
|
||||
console.error('no main thread found')
|
||||
return []
|
||||
}
|
||||
const timelineItems = Array.from(
|
||||
        mainThread.querySelectorAll('.timeline-item')
      )

      if (timelineItems.length === 0) {
        console.error('no timeline items found')
        return []
      }

      for (let i = 0; i < timelineItems.length; i++) {
        const item = timelineItems[i]
        const classList = item.classList
        // skip unavailable tweets and earlier replies
        if (
          classList.contains('unavailable') ||
          classList.contains('earlier-replies')
        ) {
          console.info('skip unavailable tweets and earlier replies')
          continue
        }
        // if there are more replies, get them
        if (classList.contains('more-replies')) {
          const newUrl = item.querySelector('a')?.getAttribute('href')
          if (!newUrl) {
            console.error('no new url', newUrl)
            break
          }

          let html = ''
          try {
            // go to new url and wait for it to load
            const response = await axios.get(
              `${this.instance}${newUrl}`,
              option
            )

            html = response.data as string
          } catch (error) {
            console.error('Error getting tweets', error)
            break
          }

          const document = parseHTML(html).document
          const nextThread = document.querySelector('.main-thread .after-tweet')
          if (!nextThread) {
            console.error('no next thread found')
            break
          }

          // get the new timeline items and add them to the list
          const newTimelineItems = Array.from(
            nextThread.querySelectorAll('.timeline-item')
          )

          timelineItems.push(...newTimelineItems)
          continue
        }

        const tweet = parseTweet(item)
        // filter out replies
        if (
          tweet &&
          tweet.author.username.toLowerCase() === username.toLowerCase()
        ) {
          tweets.push(tweet)
        }
      }

      return tweets
    } catch (error) {
      console.error('Error getting tweets', error)

      return []
    } finally {
      await redisClient?.quit()
    }
  }

  parseTweetUrl = (url: string) => {
    const match = url.match(this.URL_MATCH)
    return {
      domain: match?.[1],
      username: match?.[4],
      tweetId: match?.[5],
    }
  }

  titleForTweet = (author: { name: string }, text: string) => {
    return `${author.name} on Twitter: ${truncate(text.replace(/http\S+/, ''), {
      length: 100,
    })}`
  }

  formatTimestamp = (timestamp: string) => {
    return DateTime.fromJSDate(new Date(timestamp)).toLocaleString(
      DateTime.DATETIME_FULL
    )
  }

  shouldPreHandle(url: string): boolean {
    return this.URL_MATCH.test(url.toString())
  }

  async preHandle(url: string): Promise<PreHandleResult> {
    const { tweetId, username, domain } = this.parseTweetUrl(url)
    if (!tweetId || !username || !domain) {
      throw new Error('could not parse tweet url')
    }
    const tweets = await this.getTweets(username, tweetId)
    if (tweets.length === 0) {
      throw new Error('could not get tweets')
    }

    const tweet = tweets[0]
    const author = tweet.author
    // escape html entities in title
    const title = this.titleForTweet(author, tweet.text)
    const escapedTitle = _.escape(title)
    const authorImage = `${this.instance}${author.profileImageUrl.replace(
      '_normal',
      '_400x400'
    )}`
    const description = _.escape(tweet.text) || escapedTitle
    const imageDomain =
      domain.toLowerCase() === 'twitter.com'
        ? 'https://pbs.twimg.com'
        : 'https://nitter.net/pic'

    let tweetsContent = ''
    for (const tweet of tweets) {
      let text = tweet.text
      for (const urlObj of tweet.entities.urls) {
        text = text.replace(
          urlObj.displayUrl,
          `<a href="${urlObj.url}">${urlObj.displayUrl}</a>`
        )
      }

      const includesHtml = tweet.attachments
        .map(
          (attachment) =>
            `<a class="media-link" href=${imageDomain}${decodeURIComponent(
              attachment.url
            ).replace('/pic', '')}>
          <picture>
            <img class="tweet-img" src=${imageDomain}${decodeURIComponent(
              attachment.previewUrl
            ).replace('/pic', '')} />
          </picture>
          </a>`
        )
        .join('\n')

      tweetsContent += `<p class="_omnivore_tweet_content">${text}</p>${includesHtml}`
    }

    const tweetUrl = `
       — <a href="https://${domain}/${author.username}">${
      author.username
    }</a> <span itemscope itemtype="https://schema.org/Person" itemprop="author">${
      author.name
    }</span> <a href="${url}">${this.formatTimestamp(tweet.createdAt)}</a>`

    const content = `
    <html>
      <head>
        <meta property="og:image" content="${authorImage}" />
        <meta property="og:image:secure_url" content="${authorImage}" />
        <meta property="og:title" content="${escapedTitle}" />
        <meta property="og:description" content="${description}" />
        <meta property="article:published_time" content="${tweet.createdAt}" />
        <meta property="og:site_name" content="Twitter" />
        <meta property="og:type" content="tweet" />
        <meta property="dc:creator" content="${author.name}" />
        <meta property="twitter:description" content="${description}" />
      </head>
      <body>
        <div class="_omnivore_twitter">
          ${tweetsContent}
          ${tweetUrl}
        </div>
      </body>
    </html>`

    return { content, url, title }
  }
}
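A usage sketch, not part of the commit: how `parseTweetUrl` feeds `preHandle`. The class name `NitterHandler` and the example capture values are assumptions; the actual grouping comes from `URL_MATCH`, defined earlier in this file.

const handler = new NitterHandler()
const parts = handler.parseTweetUrl(
  'https://twitter.com/OmnivoreApp/status/1234567890'
)
// assuming URL_MATCH captures domain/username/tweetId as used above:
// parts.domain === 'twitter.com', parts.username === 'OmnivoreApp',
// parts.tweetId === '1234567890'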
@@ -0,0 +1,18 @@
import { ContentHandler, PreHandleResult } from '../content-handler'

export class PdfHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'PDF'
  }

  shouldPreHandle(url: string): boolean {
    const u = new URL(url)
    const path = u.pathname.replace(u.search, '')
    return path.endsWith('.pdf')
  }

  async preHandle(url: string): Promise<PreHandleResult> {
    return Promise.resolve({ contentType: 'application/pdf' })
  }
}
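A sketch of how a dispatcher might consume handlers like `PdfHandler`; the registry and the `preHandleContent` entry point are illustrative assumptions, not part of this commit:

const handlers: ContentHandler[] = [new PdfHandler()]

const preHandleContent = async (url: string): Promise<PreHandleResult> => {
  for (const handler of handlers) {
    if (handler.shouldPreHandle(url)) {
      // hand off to the first handler that claims the URL
      return handler.preHandle(url)
    }
  }
  return { url }
}

// await preHandleContent('https://example.com/paper.pdf')
// => { contentType: 'application/pdf' }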
@@ -0,0 +1,83 @@
import axios from 'axios'
import _ from 'underscore'
import { ContentHandler, PreHandleResult } from '../content-handler'

export class PipedVideoHandler extends ContentHandler {
  // https://piped.video/watch?v={videoId}
  PIPED_URL_MATCH = /^((?:https?:)?\/\/)?piped\.video\/watch\?v=[^&]+/

  constructor() {
    super()
    this.name = 'Piped-video'
  }

  getYoutubeVideoId = (url: string) => {
    const u = new URL(url)
    return u.searchParams.get('v')
  }

  escapeTitle = (title: string) => {
    return _.escape(title)
  }

  shouldPreHandle(url: string): boolean {
    return this.PIPED_URL_MATCH.test(url.toString())
  }

  async preHandle(url: string): Promise<PreHandleResult> {
    const videoId = this.getYoutubeVideoId(url)
    if (!videoId) {
      return {}
    }
    const baseUrl = 'https://api-piped.mha.fi'
    const apiUrl = `${baseUrl}/streams/${videoId}`
    const metadata = (await axios.get(apiUrl)).data as {
      title: string
      thumbnailUrl: string
      uploader: string
      uploaderUrl: string
      uploadDate: string
      description: string
      videoStreams: {
        width: number
        height: number
        url: string
      }[]
    }
    const videoStreams = metadata.videoStreams
    if (!videoStreams || videoStreams.length === 0) {
      return {}
    }
    const videoStream = videoStreams[0]
    const src = `https://piped.mha.fi/embed/${videoId}`
    // escape html entities in title
    const title = metadata.title
    const escapedTitle = this.escapeTitle(title)
    const ratio = videoStream.width / videoStream.height
    const thumbnail = metadata.thumbnailUrl
    const height = 350
    const width = height * ratio
    const authorName = _.escape(metadata.uploader)
    const content = `
    <html>
      <head>
        <title>${escapedTitle}</title>
        <meta property="og:image" content="${thumbnail}" />
        <meta property="og:image:secure_url" content="${thumbnail}" />
        <meta property="og:title" content="${escapedTitle}" />
        <meta property="og:description" content="${metadata.description}" />
        <meta property="og:article:author" content="${authorName}" />
        <meta property="og:site_name" content="Piped Video" />
        <meta property="article:published_time" content="${metadata.uploadDate}" />
        <meta property="og:type" content="video" />
      </head>
      <body>
        <iframe width="${width}" height="${height}" src="${src}" title="${escapedTitle}" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
        <p><a href="${url}" target="_blank">${escapedTitle}</a></p>
        <p itemscope="" itemprop="author" itemtype="http://schema.org/Person">By <a href="https://piped.video${metadata.uploaderUrl}" target="_blank">${authorName}</a></p>
      </body>
    </html>`

    return { content, title }
  }
}
@@ -0,0 +1,38 @@
import { ContentHandler, PreHandleResult } from '../content-handler'
import axios from 'axios'
import { parseHTML } from 'linkedom'

export class ScrapingBeeHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'ScrapingBee'
  }

  shouldPreHandle(url: string): boolean {
    const u = new URL(url)
    const hostnames = ['nytimes.com', 'news.google.com', 'fool.ca']

    return hostnames.some((h) => u.hostname.endsWith(h))
  }

  async preHandle(url: string): Promise<PreHandleResult> {
    console.log('prehandling url with scrapingbee', url)

    try {
      const response = await axios.get('https://app.scrapingbee.com/api/v1', {
        params: {
          api_key: process.env.SCRAPINGBEE_API_KEY,
          url: url,
          return_page_source: true,
          block_ads: true,
          block_resources: false,
        },
      })
      const dom = parseHTML(response.data).document
      return { title: dom.title, content: response.data as string, url: url }
    } catch (error) {
      console.error('error prehandling url w/scrapingbee', error)
      throw error
    }
  }
}
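A hedged sketch of calling the handler directly; it assumes `SCRAPINGBEE_API_KEY` is set in the environment (the API call fails without it), and the article URL is illustrative:

const sb = new ScrapingBeeHandler()
const articleUrl = 'https://www.nytimes.com/2023/01/01/technology/example.html'
if (sb.shouldPreHandle(articleUrl)) {
  // returns the rendered page source plus the <title> parsed out of it
  const { title, content } = await sb.preHandle(articleUrl)
}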
@@ -0,0 +1,121 @@
import { ContentHandler } from '../content-handler'

export class StackOverflowHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'stackoverflow'
  }

  parseText(element: Element, title: string) {
    const newText = element.ownerDocument.createElement('div')
    const text = element.querySelector(`div[itemprop='text']`)
    if (text) {
      const votes = element
        .querySelector(`div[itemprop='upvoteCount']`)
        ?.getAttribute('data-value')

      if (votes) {
        newText.innerHTML = `<h2>${title}: ${votes} vote${
          votes === '1' ? '' : 's'
        }</h2>${text.innerHTML}`
      }
    }
    return newText
  }

  parseComments(element: Element) {
    const dom = element.ownerDocument
    const newComments = dom.createElement('div')

    // comments
    const commentsDiv = element.querySelector(`.comments`)
    if (commentsDiv) {
      const comments = commentsDiv.querySelectorAll(`.comment`)
      if (comments.length > 0) {
        newComments.innerHTML = `<h3>Comments</h3>`

        comments.forEach((comment) => {
          const author = comment.querySelector(`.comment-user`)
          const text = comment.querySelector(`.comment-copy`)?.textContent
          const authorHref = author?.getAttribute('href')
          const date = comment.querySelector(`.relativetime-clean`)?.textContent
          if (author && text && authorHref && date) {
            const newComment = dom.createElement('p')
            newComment.innerHTML = `<a href="${authorHref}"><b>${author.innerHTML}</b></a>: ${text} - ${date}`
            newComments.appendChild(newComment)
          }
        })
      }
    }

    return newComments
  }

  parseAuthors(element: Element) {
    const dom = element.ownerDocument
    const newAuthors = dom.createElement('div')

    const authors = element.querySelectorAll(`.post-signature`)
    authors.forEach((author) => {
      const isOwner = author.classList.contains('owner')
      const name = author.querySelector(`.user-details a`)?.textContent
      const link = author.querySelector(`.user-details a`)?.getAttribute('href')
      const reputation = author.querySelector(`.reputation-score`)?.textContent
      const badges = Array.from(
        author.querySelectorAll(`span[title*='badges']`)
      )
        .map((badge) => badge.getAttribute('title'))
        .join(', ')
      const date = author.querySelector(`.user-action-time`)?.textContent
      if (name && link && reputation && date) {
        const newAuthor = dom.createElement('p')
        newAuthor.innerHTML = `<a href="${link}"><b>${name}</b></a> - ${reputation} reputation - ${
          badges || 'no badge'
        } - ${date}`
        if (isOwner) {
          const author = dom.createElement('span')
          author.setAttribute('rel', 'author')
          author.innerHTML = name
          newAuthor.appendChild(author)
        }
        newAuthors.appendChild(newAuthor)
      }
    })

    return newAuthors
  }

  shouldPreParse(url: string, dom: Document): boolean {
    return new URL(url).hostname.endsWith('stackoverflow.com')
  }

  async preParse(url: string, dom: Document): Promise<Document> {
    const mainEntity = dom.querySelector(`div[itemprop='mainEntity']`)
    if (mainEntity) {
      const newMainEntity = dom.createElement('div')
      const question = mainEntity.querySelector('#question')
      if (question) {
        newMainEntity.appendChild(this.parseText(question, 'Question'))
        newMainEntity.appendChild(this.parseAuthors(question))
        newMainEntity.appendChild(this.parseComments(question))
      }

      const answersDiv = mainEntity.querySelector('#answers')
      if (answersDiv) {
        const answers = answersDiv.querySelectorAll(`.answer`)
        answers.forEach((answer) => {
          const title = answer.classList.contains('accepted-answer')
            ? 'Accepted Answer'
            : 'Answer'
          newMainEntity.appendChild(this.parseText(answer, title))
          newMainEntity.appendChild(this.parseAuthors(answer))
          newMainEntity.appendChild(this.parseComments(answer))
        })
      }

      dom.body.replaceChildren(newMainEntity)
    }

    return Promise.resolve(dom)
  }
}
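A sketch of the pre-parse flow for a question page; fetching the HTML is assumed to happen elsewhere, and the URL is illustrative:

import { parseHTML } from 'linkedom'

const so = new StackOverflowHandler()
const questionUrl = 'https://stackoverflow.com/questions/11227809/example'
const dom = parseHTML(html).document // `html` fetched by the caller
if (so.shouldPreParse(questionUrl, dom)) {
  const cleaned = await so.preParse(questionUrl, dom)
  // cleaned.body now holds only the question, answers, authors and comments
}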
@@ -0,0 +1,26 @@
import { ContentHandler } from '../content-handler'
import axios from 'axios'

export class TDotCoHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 't.co'
  }

  shouldResolve(url: string): boolean {
    const T_DOT_CO_URL_MATCH = /^https:\/\/(?:www\.)?t\.co\/.*$/
    return T_DOT_CO_URL_MATCH.test(url)
  }

  async resolve(url: string) {
    return axios
      .get(url, { maxRedirects: 0, validateStatus: null })
      .then((res) => {
        return new URL(res.headers.location).href
      })
      .catch((err) => {
        console.log('err with t.co url', err)
        return undefined
      })
  }
}
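Because `maxRedirects` is 0, axios hands back the 301/302 response itself, and the expanded target is read from its `location` header rather than being followed. A usage sketch (the t.co path is illustrative):

const tco = new TDotCoHandler()
if (tco.shouldResolve('https://t.co/AbC123')) {
  const expanded = await tco.resolve('https://t.co/AbC123')
  // expanded is the redirect target URL, or undefined if the lookup failed
}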
@@ -0,0 +1,59 @@
import axios from 'axios'
import { parseHTML } from 'linkedom'
import { ContentHandler, PreHandleResult } from '../content-handler'

export class TheAtlanticHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'The Atlantic'
  }

  shouldPreHandle(url: string): boolean {
    const u = new URL(url)
    return u.hostname.endsWith('theatlantic.com')
  }

  removeRelatedContentLinks(articleContent: Element): Node[] {
    const content = Array.from(articleContent.children)
    return content.filter(
      (paragraph) => !paragraph.className.startsWith('ArticleRelated')
    )
  }

  unfurlContent(content: Document): Document {
    const articleContentSection = content.querySelector(
      '[data-event-module="article body"]'
    )

    // Remove the audio player.
    content.querySelector('[data-event-module="audio player"]')?.remove()

    if (!articleContentSection) {
      return content
    }

    const articleContent = this.removeRelatedContentLinks(articleContentSection)
    const divOverArticle = content.createElement('div')
    divOverArticle.setAttribute('id', 'prehandled')
    articleContent.forEach((it) => divOverArticle.appendChild(it))

    // insertBefore must be called on the section's parent node, not on the
    // document itself, so the reference-node check passes.
    articleContentSection.parentNode?.insertBefore(
      divOverArticle,
      articleContentSection
    )
    articleContentSection.remove()

    return content
  }

  async preHandle(url: string): Promise<PreHandleResult> {
    // We simply retrieve the article without JavaScript enabled, using a plain GET request.
    const response = await axios.get(url)
    const data = response.data as string
    const dom = parseHTML(data).document
    const editedDom = this.unfurlContent(dom)

    return {
      content: editedDom.body.outerHTML,
      title: dom.title,
      dom: editedDom,
    }
  }
}
@@ -0,0 +1,388 @@
import axios from 'axios'
import { truncate } from 'lodash'
import { DateTime } from 'luxon'
import { Browser, BrowserContext } from 'puppeteer-core'
import _ from 'underscore'
import { ContentHandler, PreHandleResult } from '../content-handler'

interface TweetIncludes {
  users: {
    id: string
    name: string
    profile_image_url: string
    username: string
  }[]
  media?: {
    preview_image_url: string
    type: string
    url: string
    media_key: string
  }[]
}

interface TweetMeta {
  result_count: number
}

interface TweetData {
  author_id: string
  text: string
  entities: {
    urls: {
      url: string
      expanded_url: string
      display_url: string
    }[]
  }
  created_at: string
  referenced_tweets: {
    type: string
    id: string
  }[]
  conversation_id: string
  attachments?: {
    media_keys: string[]
  }
}

interface Tweet {
  data: TweetData
  includes: TweetIncludes
}

interface Tweets {
  data: TweetData[]
  includes: TweetIncludes
  meta: TweetMeta
}

const TWITTER_BEARER_TOKEN = process.env.TWITTER_BEARER_TOKEN
const TWITTER_URL_MATCH =
  /twitter\.com\/(?:#!\/)?(\w+)\/status(?:es)?\/(\d+)(?:\/.*)?/
const MAX_THREAD_DEPTH = 100

const getTweetFields = () => {
  const TWEET_FIELDS =
    '&tweet.fields=attachments,author_id,conversation_id,created_at,' +
    'entities,geo,in_reply_to_user_id,lang,possibly_sensitive,public_metrics,referenced_tweets,' +
    'source,withheld'
  const EXPANSIONS = '&expansions=author_id,attachments.media_keys'
  const USER_FIELDS =
    '&user.fields=created_at,description,entities,location,pinned_tweet_id,profile_image_url,protected,public_metrics,url,verified,withheld'
  const MEDIA_FIELDS =
    '&media.fields=duration_ms,height,preview_image_url,url,media_key,public_metrics,width'

  return `${TWEET_FIELDS}${EXPANSIONS}${USER_FIELDS}${MEDIA_FIELDS}`
}

// unroll recent tweet thread
const getTweetThread = async (conversationId: string): Promise<Tweets> => {
  const BASE_ENDPOINT = 'https://api.twitter.com/2/tweets/search/recent'
  const apiUrl = new URL(
    BASE_ENDPOINT +
      '?query=' +
      encodeURIComponent(`conversation_id:${conversationId}`) +
      getTweetFields() +
      `&max_results=${MAX_THREAD_DEPTH}`
  )

  if (!TWITTER_BEARER_TOKEN) {
    throw new Error('No Twitter bearer token found')
  }

  const response = await axios.get<Tweets>(apiUrl.toString(), {
    headers: {
      Authorization: `Bearer ${TWITTER_BEARER_TOKEN}`,
      redirect: 'follow',
    },
  })
  return response.data
}

const getTweetById = async (id: string): Promise<Tweet> => {
  const BASE_ENDPOINT = 'https://api.twitter.com/2/tweets/'
  const apiUrl = new URL(BASE_ENDPOINT + id + '?' + getTweetFields())

  if (!TWITTER_BEARER_TOKEN) {
    throw new Error('No Twitter bearer token found')
  }

  const response = await axios.get<Tweet>(apiUrl.toString(), {
    headers: {
      Authorization: `Bearer ${TWITTER_BEARER_TOKEN}`,
      redirect: 'follow',
    },
  })

  return response.data
}

const getTweetsByIds = async (ids: string[]): Promise<Tweets> => {
  const BASE_ENDPOINT = 'https://api.twitter.com/2/tweets?ids='
  const apiUrl = new URL(BASE_ENDPOINT + ids.join() + getTweetFields())

  if (!TWITTER_BEARER_TOKEN) {
    throw new Error('No Twitter bearer token found')
  }

  const response = await axios.get<Tweets>(apiUrl.toString(), {
    headers: {
      Authorization: `Bearer ${TWITTER_BEARER_TOKEN}`,
      redirect: 'follow',
    },
  })

  return response.data
}

const titleForTweet = (author: { name: string }, text: string) => {
  return `${author.name} on Twitter: ${truncate(text.replace(/http\S+/, ''), {
    length: 100,
  })}`
}

const tweetIdFromStatusUrl = (url: string): string | undefined => {
  const match = url.toString().match(TWITTER_URL_MATCH)
  return match?.[2]
}

const formatTimestamp = (timestamp: string) => {
  return DateTime.fromJSDate(new Date(timestamp)).toLocaleString(
    DateTime.DATETIME_FULL
  )
}

const getTweetsFromResponse = (response: Tweets): Tweet[] => {
  const tweets = []
  for (const t of response.data) {
    const media = response.includes.media?.filter((m) =>
      t.attachments?.media_keys?.includes(m.media_key)
    )
    const tweet: Tweet = {
      data: t,
      includes: {
        users: response.includes.users,
        media,
      },
    }
    tweets.push(tweet)
  }
  return tweets
}

const getOldTweets = async (
  browser: Browser,
  conversationId: string,
  username: string
): Promise<Tweet[]> => {
  const tweetIds = await getTweetIds(browser, conversationId, username)
  if (tweetIds.length === 0) {
    return []
  }
  const response = await getTweetsByIds(tweetIds)
  return getTweetsFromResponse(response)
}

const getRecentTweets = async (conversationId: string): Promise<Tweet[]> => {
  const thread = await getTweetThread(conversationId)
  if (thread.meta.result_count === 0) {
    return []
  }
  // tweets are in reverse chronological order in the thread
  return getTweetsFromResponse(thread).reverse()
}

/**
 * Wait for `ms` amount of milliseconds
 * @param {number} ms
 */
const waitFor = (ms: number) =>
  new Promise((resolve) => setTimeout(resolve, ms))

/**
 * Get tweets (even older than 7 days) using puppeteer
 * @param browser
 * @param {string} tweetId
 * @param {string} author
 */
const getTweetIds = async (
  browser: Browser,
  tweetId: string,
  author: string
): Promise<string[]> => {
  const pageURL = `https://twitter.com/${author}/status/${tweetId}`

  let context: BrowserContext | undefined
  try {
    context = await browser.createIncognitoBrowserContext()
    const page = await context.newPage()

    // Modify this variable to control the size of viewport
    const deviceScaleFactor = 0.2
    const height = Math.floor(2000 / deviceScaleFactor)
    const width = Math.floor(1700 / deviceScaleFactor)
    await page.setViewport({ width, height, deviceScaleFactor })

    await page.goto(pageURL, {
      waitUntil: 'networkidle0',
      timeout: 60000, // 60 seconds
    })

    return await page.evaluate(async (author) => {
      /**
       * Wait for `ms` amount of milliseconds
       * @param {number} ms
       */
      const waitFor = (ms: number) =>
        new Promise((resolve) => setTimeout(resolve, ms))

      const ids = []

      // Find the first "Show replies" button and click it
      const showRepliesButton = Array.from(
        document.querySelectorAll('div[dir]')
      )
        .filter(
          (node) => node.children[0] && node.children[0].tagName === 'SPAN'
        )
        .find((node) => node.children[0].innerHTML === 'Show replies')

      if (showRepliesButton) {
        ;(showRepliesButton as HTMLElement).click()

        await waitFor(2000)
      }

      const timeNodes = Array.from(document.querySelectorAll('time'))

      for (const timeNode of timeNodes) {
        /** @type {HTMLAnchorElement | HTMLSpanElement} */
        const timeContainerAnchor: HTMLAnchorElement | HTMLSpanElement | null =
          timeNode.parentElement
        if (!timeContainerAnchor) continue

        if (timeContainerAnchor.tagName === 'SPAN') continue

        const href = timeContainerAnchor.getAttribute('href')
        if (!href) continue

        // Get the tweet id and username from the href: https://twitter.com/username/status/1234567890
        const match = href.match(/\/([^/]+)\/status\/(\d+)/)
        if (!match) continue

        const id = match[2]
        const username = match[1]

        // skip non-author replies
        username === author && ids.push(id)
      }

      return ids
    }, author)
  } catch (error) {
    console.error('Error getting tweets', error)

    return []
  } finally {
    if (context) {
      await context.close()
    }
  }
}

export class TwitterHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Twitter'
  }

  shouldPreHandle(url: string): boolean {
    return !!TWITTER_BEARER_TOKEN && TWITTER_URL_MATCH.test(url.toString())
  }

  async preHandle(url: string, browser: Browser): Promise<PreHandleResult> {
    const tweetId = tweetIdFromStatusUrl(url)
    if (!tweetId) {
      throw new Error('could not find tweet id in url')
    }
    let tweet = await getTweetById(tweetId)
    const conversationId = tweet.data.conversation_id
    if (conversationId !== tweetId) {
      // this is a reply, so we need to get the referenced tweet
      tweet = await getTweetById(conversationId)
    }

    const tweetData = tweet.data
    const authorId = tweetData.author_id
    // note the strict comparison: an assignment (`u.id = authorId`) here
    // would overwrite every user id instead of selecting the author
    const author = tweet.includes.users.filter((u) => u.id === authorId)[0]
    // escape html entities in title
    const title = titleForTweet(author, tweetData.text)
    const escapedTitle = _.escape(title)
    const authorImage = author.profile_image_url.replace('_normal', '_400x400')
    const description = _.escape(tweetData.text)

    // use puppeteer to get all tweet replies in the thread
    const tweets = await getOldTweets(browser, conversationId, author.username)

    let tweetsContent = ''
    for (const tweet of tweets) {
      const tweetData = tweet.data
      let text = tweetData.text
      if (tweetData.entities && tweetData.entities.urls) {
        for (const urlObj of tweetData.entities.urls) {
          text = text.replace(
            urlObj.url,
            `<a href="${urlObj.expanded_url}">${urlObj.display_url}</a>`
          )
        }
      }

      const includesHtml =
        tweet.includes.media
          ?.map((m) => {
            const linkUrl = m.type == 'photo' ? m.url : url
            const previewUrl = m.type == 'photo' ? m.url : m.preview_image_url
            return `<a class="media-link" href=${linkUrl}>
          <picture>
            <img class="tweet-img" src=${previewUrl} />
          </picture>
          </a>`
          })
          .join('\n') ?? ''

      tweetsContent += `
      <p>${text}</p>
      ${includesHtml}
    `
    }

    const tweetUrl = `
       — <a href="https://twitter.com/${author.username}">${
      author.username
    }</a> <span itemscope itemtype="https://schema.org/Person" itemprop="author">${
      author.name
    }</span> <a href="${url}">${formatTimestamp(tweetData.created_at)}</a>
    `

    const content = `
    <html>
      <head>
        <meta property="og:image" content="${authorImage}" />
        <meta property="og:image:secure_url" content="${authorImage}" />
        <meta property="og:title" content="${escapedTitle}" />
        <meta property="og:description" content="${description}" />
        <meta property="article:published_time" content="${tweetData.created_at}" />
        <meta property="og:site_name" content="Twitter" />
        <meta property="og:type" content="tweet" />
      </head>
      <body>
        <div>
          ${tweetsContent}
          ${tweetUrl}
        </div>
      </body>
    </html>`

    return { content, url, title }
  }
}
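A sketch of driving the handler; the puppeteer-core `browser` is assumed to be created by the surrounding service, and without `TWITTER_BEARER_TOKEN` the handler never activates:

const tw = new TwitterHandler()
const statusUrl = 'https://twitter.com/OmnivoreApp/status/1234567890'
if (tw.shouldPreHandle(statusUrl)) {
  // browser: a connected puppeteer-core Browser (assumed to come from the caller)
  const result = await tw.preHandle(statusUrl, browser)
  // result.content is a self-contained HTML rendering of the unrolled thread
}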
@@ -0,0 +1,48 @@
import { DateTime } from 'luxon'
import { ContentHandler } from '../content-handler'

export class WeixinQqHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Weixin QQ'
  }

  shouldPreParse(url: string, dom: Document): boolean {
    return new URL(url).hostname.endsWith('weixin.qq.com')
  }

  async preParse(url: string, dom: Document): Promise<Document> {
    // Retrieve the publish time
    const publishTime = dom.querySelector('#publish_time')?.textContent
    if (publishTime) {
      const dateTimeFormat = 'yyyy-LL-dd HH:mm'
      // published time is in UTC+8
      const publishTimeISO = DateTime.fromFormat(publishTime, dateTimeFormat, {
        zone: 'Asia/Shanghai',
      }).toISO()

      // create a meta node to store the publish time in ISO format
      const metaNode = dom.createElement('meta')
      metaNode.setAttribute('name', 'date')
      metaNode.setAttribute('content', publishTimeISO)
      dom.querySelector('head')?.appendChild(metaNode)
    }
    // Rename the article-info class so the parser preserves this block
    dom
      .querySelector('.rich_media_meta_list')
      ?.setAttribute('class', '_omnivore_rich_media_meta_list')

    // Remove the title
    dom.querySelector('.rich_media_title')?.remove()

    // Remove the profile info
    dom.querySelector('.profile_container')?.remove()

    // Remove the footer
    dom.querySelector('#content_bottom_area')?.remove()
    dom.querySelector('.rich_media_area_extra')?.remove()
    dom.querySelector('#js_pc_qr_code')?.remove()

    return Promise.resolve(dom)
  }
}
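For reference, a small example of the timezone conversion this handler performs; the timestamp value is illustrative:

import { DateTime } from 'luxon'

// a Weixin publish time in UTC+8 becomes an ISO string with the offset kept
const iso = DateTime.fromFormat('2023-01-02 15:04', 'yyyy-LL-dd HH:mm', {
  zone: 'Asia/Shanghai',
}).toISO()
// iso === '2023-01-02T15:04:00.000+08:00'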
@@ -0,0 +1,24 @@
import { ContentHandler } from '../content-handler'

export class WikipediaHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'wikipedia'
  }

  shouldPreParse(url: string, dom: Document): boolean {
    return new URL(url).hostname.endsWith('wikipedia.org')
  }

  async preParse(url: string, dom: Document): Promise<Document> {
    // Remove the [edit] anchors from wikipedia pages
    dom.querySelectorAll('.mw-editsection').forEach((e) => e.remove())

    // Remove footnotes
    dom.querySelectorAll('sup[class="reference"]').forEach((e) => e.remove())

    // Remove the sidebar
    dom.querySelector('.infobox')?.remove()
    return Promise.resolve(dom)
  }
}
@@ -0,0 +1,59 @@
import axios from 'axios'
import { parseHTML } from 'linkedom'
import { ContentHandler, PreHandleResult } from '../content-handler'

export class WiredHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Wired'
  }

  // We check whether this is a paywalled article; paywalled articles mark
  // their body paragraphs with the `paywall` class.
  isPaywalledContent(document: Document): boolean {
    return document.getElementsByClassName('paywall').length > 0
  }

  removeNonArticleNodes(document: Document): Document {
    const genericCallouts = Array.from(
      document.querySelectorAll('[data-testid="GenericCallout"]')
    )
    const ads = Array.from(document.querySelectorAll('.ad__slot')).map(
      (it) => it.parentElement
    )
    const mostPopularArticles = Array.from(
      document.querySelectorAll('[data-most-popular-id]')
    )

    ;[...genericCallouts, ...ads, ...mostPopularArticles].forEach((it) =>
      it?.remove()
    )

    return document
  }

  shouldPreHandle(url: string): boolean {
    const u = new URL(url)
    return u.hostname.endsWith('wired.com')
  }

  async preHandle(url: string): Promise<PreHandleResult> {
    const response = await axios.get(url)
    const data = response.data as string
    const dom = parseHTML(data).document

    if (!this.isPaywalledContent(dom)) {
      // This is just to ensure that the currently working articles don't break.
      // Looking further into this, they might all have paywalls?
      return {}
    }

    const cleanedArticleDom = this.removeNonArticleNodes(dom)

    return {
      content: cleanedArticleDom.body.outerHTML,
      title: dom.title,
      dom: cleanedArticleDom,
    }
  }
}
@@ -0,0 +1,97 @@
import { ContentHandler, PreHandleResult } from '../content-handler'
import axios from 'axios'
import _ from 'underscore'

// the dot in youtu.be is escaped so it matches only the literal hostname
const YOUTUBE_URL_MATCH =
  /^((?:https?:)?\/\/)?((?:www|m)\.)?((?:youtube\.com|youtu\.be))(\/(?:[\w-]+\?v=|embed\/|v\/)?)([\w-]+)(\S+)?$/

export const getYoutubeVideoId = (url: string) => {
  const u = new URL(url)
  const videoId = u.searchParams.get('v')
  if (!videoId) {
    const match = url.toString().match(YOUTUBE_URL_MATCH)
    if (match === null || match.length < 6 || !match[5]) {
      return undefined
    }
    return match[5]
  }
  return videoId
}

export const getYoutubePlaylistId = (url: string) => {
  const u = new URL(url)
  return u.searchParams.get('list')
}

export const escapeTitle = (title: string) => {
  return _.escape(title)
}

export class YoutubeHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Youtube'
  }

  shouldPreHandle(url: string): boolean {
    return YOUTUBE_URL_MATCH.test(url.toString())
  }

  async preHandle(url: string): Promise<PreHandleResult> {
    const BaseUrl = 'https://www.youtube.com'
    const embedBaseUrl = 'https://www.youtube.com/embed'
    let urlToEncode: string
    let src: string
    const playlistId = getYoutubePlaylistId(url)
    if (playlistId) {
      urlToEncode = `${BaseUrl}/playlist?list=${playlistId}`
      src = `${embedBaseUrl}/videoseries?list=${playlistId}`
    } else {
      const videoId = getYoutubeVideoId(url)
      if (!videoId) {
        return {}
      }
      urlToEncode = `${BaseUrl}/watch?v=${videoId}`
      src = `${embedBaseUrl}/${videoId}`
    }

    const oembedUrl =
      `https://www.youtube.com/oembed?format=json&url=` +
      encodeURIComponent(urlToEncode)
    const oembed = (await axios.get(oembedUrl.toString())).data as {
      title: string
      width: number
      height: number
      thumbnail_url: string
      author_name: string
      author_url: string
    }
    // escape html entities in title
    const title = oembed.title
    const escapedTitle = escapeTitle(title)
    const ratio = oembed.width / oembed.height
    const thumbnail = oembed.thumbnail_url
    const height = 350
    const width = height * ratio
    const authorName = _.escape(oembed.author_name)
    const content = `
    <html>
      <head><title>${escapedTitle}</title>
        <meta property="og:image" content="${thumbnail}" />
        <meta property="og:image:secure_url" content="${thumbnail}" />
        <meta property="og:title" content="${escapedTitle}" />
        <meta property="og:description" content="" />
        <meta property="og:article:author" content="${authorName}" />
        <meta property="og:site_name" content="YouTube" />
        <meta property="og:type" content="video" />
      </head>
      <body>
        <iframe width="${width}" height="${height}" src="${src}" title="${escapedTitle}" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
        <p><a href="${url}" target="_blank">${escapedTitle}</a></p>
        <p itemscope="" itemprop="author" itemtype="http://schema.org/Person">By <a href="${oembed.author_url}" target="_blank">${authorName}</a></p>
      </body>
    </html>`

    return { content, title }
  }
}
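The iframe dimensions preserve the oEmbed aspect ratio at a fixed 350px height; a worked example with illustrative numbers:

// a 480x270 (16:9) oEmbed answer scales to roughly 622x350
const ratio = 480 / 270 // ≈ 1.78
const height = 350
const width = height * ratio // ≈ 622.2, interpolated into the iframe markup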
@@ -0,0 +1,117 @@
import { ContentHandler } from '../content-handler'

export class ZhihuHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'zhihu'
  }

  parseQuestion(element: Element) {
    const newQuestion = element.ownerDocument.createElement('div')
    const question = element.querySelector(`.QuestionHeader-main`)
    if (question) {
      const votes = element
        .querySelector(`div[itemprop='upvoteCount']`)
        ?.getAttribute('data-value')

      if (votes) {
        // '问题' is Chinese for 'Question'
        newQuestion.innerHTML = `<h2>问题: ${votes} vote${
          votes === '1' ? '' : 's'
        }</h2>${question.innerHTML}`
      }
    }
    return newQuestion
  }

  parseComments(element: Element) {
    const dom = element.ownerDocument
    const newComments = dom.createElement('div')

    // comments
    const commentsDiv = element.querySelector(`.comments`)
    if (commentsDiv) {
      const comments = commentsDiv.querySelectorAll(`.comment`)
      if (comments.length > 0) {
        newComments.innerHTML = `<h3>Comments</h3>`

        comments.forEach((comment) => {
          const author = comment.querySelector(`.comment-user`)
          const text = comment.querySelector(`.comment-copy`)?.textContent
          const authorHref = author?.getAttribute('href')
          const date = comment.querySelector(`.relativetime-clean`)?.textContent
          if (author && text && authorHref && date) {
            const newComment = dom.createElement('p')
            newComment.innerHTML = `<a href="${authorHref}"><b>${author.innerHTML}</b></a>: ${text} - ${date}`
            newComments.appendChild(newComment)
          }
        })
      }
    }

    return newComments
  }

  parseAuthors(element: Element) {
    const dom = element.ownerDocument
    const newAuthors = dom.createElement('div')

    const authors = element.querySelectorAll(`.post-signature`)
    authors.forEach((author) => {
      const isOwner = author.classList.contains('owner')
      const name = author.querySelector(`.user-details a`)?.textContent
      const link = author.querySelector(`.user-details a`)?.getAttribute('href')
      const reputation = author.querySelector(`.reputation-score`)?.textContent
      const badges = Array.from(
        author.querySelectorAll(`span[title*='badges']`)
      )
        .map((badge) => badge.getAttribute('title'))
        .join(', ')
      const date = author.querySelector(`.user-action-time`)?.textContent
      if (name && link && reputation && date) {
        const newAuthor = dom.createElement('p')
        newAuthor.innerHTML = `<a href="${link}"><b>${name}</b></a> - ${reputation} reputation - ${
          badges || 'no badge'
        } - ${date}`
        if (isOwner) {
          const author = dom.createElement('span')
          author.setAttribute('rel', 'author')
          author.innerHTML = name
          newAuthor.appendChild(author)
        }
        newAuthors.appendChild(newAuthor)
      }
    })

    return newAuthors
  }

  shouldPreParse(url: string, dom: Document): boolean {
    return new URL(url).hostname.endsWith('zhihu.com')
  }

  async preParse(url: string, dom: Document): Promise<Document> {
    const mainEntity = dom.querySelector(`div[itemprop='mainEntity']`)
    if (mainEntity) {
      const newMainEntity = dom.createElement('div')
      const question = mainEntity.querySelector('.QuestionHeader')
      if (question) {
        question.className = '_omnivore_zhihu_question'
        newMainEntity.appendChild(question)
      }

      const answers = mainEntity.querySelectorAll('.ContentItem.AnswerItem')
      answers.forEach((answer) => {
        answer
          .querySelector('.AuthorInfo')
          ?.setAttribute('class', '_omnivore_zhihu_author')

        answer.className = '_omnivore_zhihu_answer'
        newMainEntity.appendChild(answer)
      })

      dom.body.replaceChildren(newMainEntity)
    }

    return Promise.resolve(dom)
  }
}