Add files for building the OpenSSL image
examples/omnivore/api/content-handler/src/content-handler.ts (new file, 189 lines)
@@ -0,0 +1,189 @@
import addressparser from 'addressparser'
import axios from 'axios'
import { parseHTML } from 'linkedom'
import { Browser } from 'puppeteer-core'
import { v4 as uuid } from 'uuid'

interface Unsubscribe {
  mailTo?: string
  httpUrl?: string
}

export interface NewsletterInput {
  from: string
  to: string
  subject: string
  html: string
  headers: Record<string, string | string[]>
}

export interface NewsletterResult {
  email: string
  content: string
  url: string
  title: string
  author: string
  unsubMailTo?: string
  unsubHttpUrl?: string
}

export interface PreHandleResult {
  url?: string
  title?: string
  content?: string
  contentType?: string
  dom?: Document
}

export const FAKE_URL_PREFIX = 'https://omnivore.app/no_url?q='
export const generateUniqueUrl = () => FAKE_URL_PREFIX + uuid()

export abstract class ContentHandler {
  protected senderRegex: RegExp
  protected urlRegex: RegExp
  name: string

  protected constructor() {
    this.senderRegex = new RegExp(/NEWSLETTER_SENDER_REGEX/)
    this.urlRegex = new RegExp(/NEWSLETTER_URL_REGEX/)
    this.name = 'Handler name'
  }

  shouldResolve(url: string): boolean {
    return false
  }

  async resolve(url: string): Promise<string | undefined> {
    return Promise.resolve(url)
  }

  shouldPreHandle(url: string): boolean {
    return false
  }

  async preHandle(url: string, browser?: Browser): Promise<PreHandleResult> {
    return Promise.resolve({ url })
  }

  shouldPreParse(url: string, dom: Document): boolean {
    return false
  }

  async preParse(url: string, dom: Document): Promise<Document> {
    return Promise.resolve(dom)
  }

  async isNewsletter(input: {
    from: string
    html: string
    headers: Record<string, string | string[]>
    dom: Document
  }): Promise<boolean> {
    const re = new RegExp(this.senderRegex)
    const postHeader = input.headers['list-post']
    const unSubHeader = input.headers['list-unsubscribe']
    return Promise.resolve(
      re.test(input.from) && (!!postHeader || !!unSubHeader)
    )
  }

  findNewsletterHeaderHref(dom: Document): string | undefined {
    return undefined
  }

  // Given an HTML blob, tries to find a URL to use as
  // the canonical URL.
  async findNewsletterUrl(html: string): Promise<string | undefined> {
    const dom = parseHTML(html).document

    // Check if this is a substack newsletter
    const href = this.findNewsletterHeaderHref(dom)
    if (href) {
      // Try to make a HEAD request, so we get the redirected URL, since these
      // will usually be behind tracking url redirects
      try {
        const response = await axios.head(href, { timeout: 5000 })
        return Promise.resolve(
          // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access
          response.request.res.responseUrl as string | undefined
        )
      } catch (e) {
        console.log('error making HEAD request', e)
        return Promise.resolve(href)
      }
    }

    return Promise.resolve(undefined)
  }

  async parseNewsletterUrl(
    headers: Record<string, string | string[]>,
    html: string
  ): Promise<string | undefined> {
    // get url from dom
    const url = await this.findNewsletterUrl(html)
    if (url) {
      return url
    }
    // get newsletter url from html
    const matches = html.match(this.urlRegex)
    if (matches) {
      return matches[1]
    }
    return undefined
  }

  parseAuthor(from: string): string {
    // get author name from email
    // e.g. 'Jackson Harper from Omnivore App <jacksonh@substack.com>'
    // or 'Mike Allen <mike@axios.com>'
    const parsed = addressparser(from)
    if (parsed.length > 0 && parsed[0].name) {
      return parsed[0].name
    }
    return from
  }

  parseUnsubscribe(unSubHeader: string): Unsubscribe {
    // parse list-unsubscribe header
    // e.g. List-Unsubscribe: <https://omnivore.com/unsub>, <mailto:unsub@omnivore.com>
    return {
      httpUrl: unSubHeader.match(/<(https?:\/\/[^>]*)>/)?.[1],
      mailTo: unSubHeader.match(/<mailto:([^>]*)>/)?.[1],
    }
  }

  async handleNewsletter({
    from,
    to,
    subject,
    html,
    headers,
  }: NewsletterInput): Promise<NewsletterResult> {
    console.log('handleNewsletter', from, to, subject, headers)

    if (!from || !html || !subject || !to) {
      console.log('invalid newsletter email')
      throw new Error('invalid newsletter email')
    }

    // fallback to default url if newsletter url does not exist
    // assign a random uuid to the default url to avoid duplicate url
    const url =
      (await this.parseNewsletterUrl(headers, html)) || generateUniqueUrl()
    const author = this.parseAuthor(from)
    const unsubscribe = headers['list-unsubscribe']
      ? this.parseUnsubscribe(headers['list-unsubscribe'].toString())
      : undefined

    return {
      email: to,
      content: html,
      url,
      title: subject,
      author,
      unsubMailTo: unsubscribe?.mailTo || '',
      unsubHttpUrl: unsubscribe?.httpUrl || '',
    }
  }
}
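For orientation only, and not part of this commit: a minimal sketch of how a site-specific handler builds on this base class, following the pattern the handlers below use. The handler name, sender address, and CSS selector are hypothetical placeholders.

import { ContentHandler } from './content-handler'

// Hypothetical handler: 'example', example.com, and '.footer' are illustrative only.
export class ExampleHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'example'
    this.senderRegex = /<.+@example\.com>/
    this.urlRegex = /<a href=["']([^"']*)["'].*>View online<\/a>/
  }

  shouldPreParse(url: string, dom: Document): boolean {
    // claim pages served from the hypothetical example.com domain
    return new URL(url).hostname.endsWith('example.com')
  }

  async preParse(url: string, dom: Document): Promise<Document> {
    // strip chrome the reader does not need before the generic parse runs
    dom.querySelector('.footer')?.remove()
    return Promise.resolve(dom)
  }
}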
examples/omnivore/api/content-handler/src/index.ts (new file, 186 lines)
@@ -0,0 +1,186 @@
import { parseHTML } from 'linkedom'
import { Browser } from 'puppeteer-core'
import {
  ContentHandler,
  NewsletterInput,
  NewsletterResult,
  PreHandleResult,
} from './content-handler'
import { AxiosHandler } from './newsletters/axios-handler'
import { BeehiivHandler } from './newsletters/beehiiv-handler'
import { BloombergNewsletterHandler } from './newsletters/bloomberg-newsletter-handler'
import { ConvertkitHandler } from './newsletters/convertkit-handler'
import { CooperPressHandler } from './newsletters/cooper-press-handler'
import { EnergyWorldHandler } from './newsletters/energy-world'
import { EveryIoHandler } from './newsletters/every-io-handler'
import { GenericHandler } from './newsletters/generic-handler'
import { GhostHandler } from './newsletters/ghost-handler'
import { GolangHandler } from './newsletters/golang-handler'
import { HeyWorldHandler } from './newsletters/hey-world-handler'
import { IndiaTimesHandler } from './newsletters/india-times-handler'
import { MorningBrewHandler } from './newsletters/morning-brew-handler'
import { RevueHandler } from './newsletters/revue-handler'
import { SubstackHandler } from './newsletters/substack-handler'
import { AppleNewsHandler } from './websites/apple-news-handler'
import { ArsTechnicaHandler } from './websites/ars-technica-handler'
import { BloombergHandler } from './websites/bloomberg-handler'
import { DerstandardHandler } from './websites/derstandard-handler'
import { GitHubHandler } from './websites/github-handler'
import { ImageHandler } from './websites/image-handler'
import { MediumHandler } from './websites/medium-handler'
import { NitterHandler } from './websites/nitter-handler'
import { PdfHandler } from './websites/pdf-handler'
import { PipedVideoHandler } from './websites/piped-video-handler'
import { ScrapingBeeHandler } from './websites/scrapingBee-handler'
import { StackOverflowHandler } from './websites/stack-overflow-handler'
import { TDotCoHandler } from './websites/t-dot-co-handler'
import { TheAtlanticHandler } from './websites/the-atlantic-handler'
import { WeixinQqHandler } from './websites/weixin-qq-handler'
import { WikipediaHandler } from './websites/wikipedia-handler'
import { YoutubeHandler } from './websites/youtube-handler'
import { ZhihuHandler } from './websites/zhihu-handler'

const validateUrlString = (url: string): boolean => {
  const u = new URL(url)
  // Make sure the URL is http or https
  if (u.protocol !== 'http:' && u.protocol !== 'https:') {
    throw new Error('Invalid URL protocol check failed')
  }
  // Make sure the domain is not localhost
  if (u.hostname === 'localhost' || u.hostname === '0.0.0.0') {
    throw new Error('Invalid URL is localhost')
  }
  // Make sure the domain is not a private IP
  if (/^(10|172\.16|192\.168)\..*/.test(u.hostname)) {
    throw new Error('Invalid URL is private ip')
  }

  return true
}

const contentHandlers: ContentHandler[] = [
  new ArsTechnicaHandler(),
  new TheAtlanticHandler(),
  new AppleNewsHandler(),
  new BloombergHandler(),
  new DerstandardHandler(),
  new ImageHandler(),
  new MediumHandler(),
  new PdfHandler(),
  new ScrapingBeeHandler(),
  new TDotCoHandler(),
  new YoutubeHandler(),
  new WikipediaHandler(),
  new GitHubHandler(),
  new AxiosHandler(),
  new GolangHandler(),
  new MorningBrewHandler(),
  new BloombergNewsletterHandler(),
  new SubstackHandler(),
  new StackOverflowHandler(),
  new EnergyWorldHandler(),
  new PipedVideoHandler(),
  new WeixinQqHandler(),
  new NitterHandler(),
  new ZhihuHandler(),
]

const newsletterHandlers: ContentHandler[] = [
  new AxiosHandler(),
  new BloombergNewsletterHandler(),
  new GolangHandler(),
  new SubstackHandler(),
  new MorningBrewHandler(),
  new BeehiivHandler(),
  new ConvertkitHandler(),
  new RevueHandler(),
  new GhostHandler(),
  new CooperPressHandler(),
  new HeyWorldHandler(),
  new GenericHandler(),
  new EveryIoHandler(),
  new EnergyWorldHandler(),
  new IndiaTimesHandler(),
]

export const preHandleContent = async (
  url: string,
  browser: Browser
): Promise<PreHandleResult | undefined> => {
  // Before we run the regular handlers we check to see if we need to
  // pre-resolve the URL. TODO: This should probably happen recursively,
  // so URLs can be pre-resolved, handled, pre-resolved, handled, etc.
  for (const handler of contentHandlers) {
    if (handler.shouldResolve(url)) {
      try {
        const resolvedUrl = await handler.resolve(url)
        if (resolvedUrl && validateUrlString(resolvedUrl)) {
          url = resolvedUrl
        }
      } catch (err) {
        console.log('error resolving url with handler', handler.name, err)
      }
      break
    }
  }
  // Before we fetch the page we check the handlers, to see if they want
  // to perform a prefetch action that can modify our requests.
  // enumerate the handlers and see if any of them want to handle the request
  for (const handler of contentHandlers) {
    if (handler.shouldPreHandle(url)) {
      console.log('preHandleContent', handler.name, url)
      return handler.preHandle(url, browser)
    }
  }
  return undefined
}

export const preParseContent = async (
  url: string,
  dom: Document
): Promise<Document | undefined> => {
  // Before we parse the page we check the handlers, to see if they want
  // to perform a preParse action that can modify our dom.
  // enumerate the handlers and see if any of them want to handle the dom
  for (const handler of contentHandlers) {
    if (handler.shouldPreParse(url, dom)) {
      console.log('preParseContent', handler.name, url)
      return handler.preParse(url, dom)
    }
  }
  return undefined
}

export const getNewsletterHandler = async (input: {
  from: string
  html: string
  headers: Record<string, string | string[]>
}): Promise<ContentHandler | undefined> => {
  const dom = parseHTML(input.html).document
  for (const handler of newsletterHandlers) {
    if (await handler.isNewsletter({ ...input, dom })) {
      return handler
    }
  }

  return undefined
}

export const handleNewsletter = async (
  input: NewsletterInput
): Promise<NewsletterResult | undefined> => {
  const handler = await getNewsletterHandler(input)
  if (handler) {
    console.log('handleNewsletter', handler.name, input.subject)
    return handler.handleNewsletter(input)
  }

  return undefined
}

module.exports = {
  preHandleContent,
  handleNewsletter,
  preParseContent,
  getNewsletterHandler,
}
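A hedged usage sketch of the exported entry points, not part of this commit; the URL, email fields, and browser wiring below are placeholders:

import { Browser } from 'puppeteer-core'
import { handleNewsletter, preHandleContent } from './index'

const example = async (browser: Browser) => {
  // website path: a matching handler may resolve and/or pre-fetch the URL
  const preHandled = await preHandleContent('https://apple.news/ABCDEF', browser)
  console.log('pre-handled:', preHandled?.url, preHandled?.title)

  // email path: the first handler whose isNewsletter() matches normalizes the email
  const result = await handleNewsletter({
    from: 'Sender <sender@example.com>', // placeholder values
    to: 'user@example.com',
    subject: 'Hello world',
    html: '<html><body>...</body></html>',
    headers: { 'list-unsubscribe': '<https://example.com/unsub>' },
  })
  console.log('newsletter url:', result?.url)
}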
examples/omnivore/api/content-handler/src/newsletters/axios-handler.ts (new file, 46 lines)
@@ -0,0 +1,46 @@
import { ContentHandler } from '../content-handler'

export class AxiosHandler extends ContentHandler {
  constructor() {
    super()
    this.senderRegex = /<.+@axios.com>/
    this.urlRegex = /View in browser at <a.*>(.*)<\/a>/
    this.name = 'axios'
  }

  shouldPreParse(url: string, dom: Document): boolean {
    const host = this.name + '.com'
    // check if url ends with axios.com
    return new URL(url).hostname.endsWith(host)
  }

  async preParse(url: string, dom: Document): Promise<Document> {
    const body = dom.querySelector('table')

    let isFooter = false
    // this removes ads and replaces table with a div
    body?.querySelectorAll('table').forEach((el) => {
      // remove the footer and the ads
      if (!el.textContent || el.textContent.length < 20 || isFooter) {
        el.remove()
      } else {
        // removes the first few rows of the table (the header)
        // remove the last two rows of the table (they are ads)
        el.querySelectorAll('tr').forEach((tr, i) => {
          if (i <= 7 || i >= el.querySelectorAll('tr').length - 2) {
            console.log('removing', tr)
            tr.remove()
          }
        })
        // replace the table with a div
        const div = dom.createElement('div')
        div.innerHTML = el.innerHTML
        el.parentNode?.replaceChild(div, el)
        // set the isFooter flag to true because the next table is the footer
        isFooter = true
      }
    })

    return Promise.resolve(dom)
  }
}
examples/omnivore/api/content-handler/src/newsletters/beehiiv-handler.ts (new file, 24 lines)
@@ -0,0 +1,24 @@
import { ContentHandler } from '../content-handler'

export class BeehiivHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'beehiiv'
  }

  async isNewsletter(input: {
    from: string
    headers: Record<string, string | string[]>
  }): Promise<boolean> {
    return Promise.resolve(
      input.headers['x-beehiiv-type']?.toString() === 'newsletter'
    )
  }

  async parseNewsletterUrl(
    headers: Record<string, string | string[]>,
    html: string
  ): Promise<string | undefined> {
    return Promise.resolve(headers['x-newsletter']?.toString())
  }
}
examples/omnivore/api/content-handler/src/newsletters/bloomberg-newsletter-handler.ts (new file, 37 lines)
@@ -0,0 +1,37 @@
import { ContentHandler } from '../content-handler'

export class BloombergNewsletterHandler extends ContentHandler {
  constructor() {
    super()
    this.senderRegex = /<.+@mail.bloomberg.*.com>/
    this.urlRegex = /<a class="view-in-browser__url" href=["']([^"']*)["']/
    this.name = 'bloomberg'
  }

  shouldPreParse(url: string, dom: Document): boolean {
    const host = this.name + '.com'
    // check if url ends with bloomberg.com
    return (
      new URL(url).hostname.endsWith(host) ||
      dom.querySelector('.logo-image')?.getAttribute('alt')?.toLowerCase() ===
        this.name
    )
  }

  async preParse(url: string, dom: Document): Promise<Document> {
    const body = dom.querySelector('.wrapper')

    // this removes header
    body?.querySelector('.sailthru-variables')?.remove()
    body?.querySelector('.preview-text')?.remove()
    body?.querySelector('.logo-wrapper')?.remove()
    body?.querySelector('.by-the-number-wrapper')?.remove()
    // this removes footer
    body?.querySelector('.quote-box-wrapper')?.remove()
    body?.querySelector('.header-wrapper')?.remove()
    body?.querySelector('.component-wrapper')?.remove()
    body?.querySelector('.footer')?.remove()

    return Promise.resolve(dom)
  }
}
examples/omnivore/api/content-handler/src/newsletters/convertkit-handler.ts (new file, 52 lines)
@@ -0,0 +1,52 @@
import { ContentHandler } from '../content-handler'

export class ConvertkitHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'convertkit'
  }

  findNewsletterHeaderHref(dom: Document): string | undefined {
    const readOnline = dom.querySelectorAll('a')
    let res: string | undefined = undefined
    readOnline.forEach((e) => {
      if (
        e.textContent === 'View this email in your browser' ||
        e.textContent === 'Read on FS'
      ) {
        res = e.getAttribute('href') || undefined
      }
    })
    return res
  }

  async isNewsletter(input: {
    from: string
    dom: Document
    headers: Record<string, string | string[]>
  }): Promise<boolean> {
    const dom = input.dom
    const icons = dom.querySelectorAll(
      'img[src*="convertkit.com"], img[src*="convertkit-mail"]'
    )
    if (icons.length === 0) {
      return Promise.resolve(false)
    }
    // ignore newsletters that have a confirmation link to the newsletter in the body
    const links = dom.querySelectorAll(
      'a[href*="convertkit.com"], a[href*="convertkit-mail"]'
    )
    const isConfirmation = Array.from(links).some((e) => {
      return e.textContent === 'Confirm your subscription'
    })

    return Promise.resolve(!isConfirmation)
  }

  async parseNewsletterUrl(
    headers: Record<string, string | string[]>,
    html: string
  ): Promise<string | undefined> {
    return this.findNewsletterUrl(html)
  }
}
examples/omnivore/api/content-handler/src/newsletters/cooper-press-handler.ts (new file, 37 lines)
@@ -0,0 +1,37 @@
import { ContentHandler } from '../content-handler'

export class CooperPressHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'cooper-press'
  }

  findNewsletterHeaderHref(dom: Document): string | undefined {
    const readOnline = dom.querySelectorAll('a')
    let res: string | undefined = undefined
    readOnline.forEach((e) => {
      if (e.textContent === 'Read on the Web') {
        res = e.getAttribute('href') || undefined
      }
    })
    return res
  }

  async isNewsletter(input: {
    from: string
    dom: Document
    headers: Record<string, string | string[]>
  }): Promise<boolean> {
    const dom = input.dom
    return Promise.resolve(
      dom.querySelectorAll('a[href*="cooperpress.com"]').length > 0
    )
  }

  async parseNewsletterUrl(
    headers: Record<string, string | string[]>,
    html: string
  ): Promise<string | undefined> {
    return this.findNewsletterUrl(html)
  }
}
examples/omnivore/api/content-handler/src/newsletters/energy-world.ts (new file, 44 lines)
@@ -0,0 +1,44 @@
import { ContentHandler } from '../content-handler'

export class EnergyWorldHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Energy World'
  }

  async isNewsletter(input: {
    from: string
    html: string
    headers: Record<string, string | string[]>
    dom: Document
  }): Promise<boolean> {
    return Promise.resolve(
      input.from === 'ETEnergyworld Latest News<newsletter@etenergyworld.com>'
    )
  }

  shouldPreParse(url: string, dom: Document): boolean {
    return dom.querySelectorAll('img[src*="etenergyworld.png"]').length > 0
  }

  async preParse(url: string, dom: Document): Promise<Document> {
    // get the main content
    const main = dom.querySelector('table[class="nletter-wrap"]')
    if (!main) {
      return Promise.resolve(dom)
    }

    // create a new dom
    const newDom = dom.createDocumentFragment()

    // add the content to the new dom
    main.querySelectorAll('table[class="multi-cols"] tr').forEach((tr) => {
      const p = dom.createElement('p')
      p.innerHTML = tr.innerHTML
      newDom.appendChild(p)
    })
    dom.body.replaceChildren(newDom)

    return Promise.resolve(dom)
  }
}
examples/omnivore/api/content-handler/src/newsletters/every-io-handler.ts (new file, 22 lines)
@@ -0,0 +1,22 @@
import { ContentHandler } from '../content-handler'

export class EveryIoHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Every.io'
  }

  async isNewsletter(input: {
    from: string
    html: string
    headers: Record<string, string | string[]>
    dom: Document
  }): Promise<boolean> {
    return Promise.resolve(input.from === 'Every <hello@every.to>')
  }

  findNewsletterHeaderHref(dom: Document): string | undefined {
    const readOnline = dom.querySelector('.newsletter-email .title a')
    return readOnline?.getAttribute('href') || undefined
  }
}
examples/omnivore/api/content-handler/src/newsletters/generic-handler.ts (new file, 49 lines)
@@ -0,0 +1,49 @@
import { ContentHandler } from '../content-handler'
import addressparser from 'addressparser'

export class GenericHandler extends ContentHandler {
  // newsletter url text regex for newsletters that don't have a newsletter header
  NEWSLETTER_URL_TEXT_REGEX =
    /((View|Read)(.*)(email|post)?(.*)(in your browser|online|on (FS|the Web))|Lire en ligne)/i

  constructor() {
    super()
    this.name = 'Generic Newsletter'
  }

  async isNewsletter(input: {
    from: string
    html: string
    headers: Record<string, string | string[]>
    dom: Document
  }): Promise<boolean> {
    const postHeader = input.headers['list-post'] || input.headers['list-id']
    const unSubHeader = input.headers['list-unsubscribe']
    return Promise.resolve(!!postHeader || !!unSubHeader)
  }

  findNewsletterHeaderHref(dom: Document): string | undefined {
    const readOnline = dom.querySelectorAll('a')
    let res: string | undefined = undefined
    readOnline.forEach((e) => {
      if (e.textContent && this.NEWSLETTER_URL_TEXT_REGEX.test(e.textContent)) {
        res = e.getAttribute('href') || undefined
      }
    })
    return res
  }

  async parseNewsletterUrl(
    headers: Record<string, string | string[]>,
    html: string
  ): Promise<string | undefined> {
    // raw Substack newsletter url is like <https://hongbo130.substack.com/p/tldr>
    // we need to get the real url from the raw url
    const postHeader = headers['list-post']?.toString()
    if (postHeader && addressparser(postHeader).length > 0) {
      return addressparser(postHeader)[0].name
    }

    return this.findNewsletterUrl(html)
  }
}
examples/omnivore/api/content-handler/src/newsletters/ghost-handler.ts (new file, 31 lines)
@@ -0,0 +1,31 @@
import { ContentHandler } from '../content-handler'

export class GhostHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'ghost'
  }

  findNewsletterHeaderHref(dom: Document): string | undefined {
    const readOnline = dom.querySelector('.view-online-link')
    return readOnline?.getAttribute('href') || undefined
  }

  async isNewsletter(input: {
    from: string
    dom: Document
    headers: Record<string, string | string[]>
  }): Promise<boolean> {
    const dom = input.dom
    return Promise.resolve(
      dom.querySelectorAll('img[src*="ghost.org"]').length > 0
    )
  }

  async parseNewsletterUrl(
    headers: Record<string, string | string[]>,
    html: string
  ): Promise<string | undefined> {
    return this.findNewsletterUrl(html)
  }
}
examples/omnivore/api/content-handler/src/newsletters/golang-handler.ts (new file, 27 lines)
@@ -0,0 +1,27 @@
import { ContentHandler } from '../content-handler'

export class GolangHandler extends ContentHandler {
  constructor() {
    super()
    this.senderRegex = /<.+@golangweekly.com>/
    this.urlRegex = /<a href=["']([^"']*)["'].*>Read on the Web<\/a>/
    this.name = 'golangweekly'
  }

  shouldPreParse(url: string, dom: Document): boolean {
    const host = this.name + '.com'
    // check if url ends with golangweekly.com
    return new URL(url).hostname.endsWith(host)
  }

  async preParse(url: string, dom: Document): Promise<Document> {
    const body = dom.querySelector('body')

    // this removes the "Subscribe" button
    body?.querySelector('.el-splitbar')?.remove()
    // this removes the title
    body?.querySelector('.el-masthead')?.remove()

    return Promise.resolve(dom)
  }
}
examples/omnivore/api/content-handler/src/newsletters/hey-world-handler.ts (new file, 27 lines)
@@ -0,0 +1,27 @@
import { ContentHandler } from '../content-handler'

export class HeyWorldHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'hey-world'
    this.senderRegex = /<.+@world.hey.com>/
  }

  findNewsletterHeaderHref(dom: Document): string | undefined {
    const readOnline = dom.querySelectorAll('a')
    let res: string | undefined = undefined
    readOnline.forEach((e) => {
      if (e.textContent === 'View this post online') {
        res = e.getAttribute('href') || undefined
      }
    })
    return res
  }

  async parseNewsletterUrl(
    headers: Record<string, string | string[]>,
    html: string
  ): Promise<string | undefined> {
    return this.findNewsletterUrl(html)
  }
}
examples/omnivore/api/content-handler/src/newsletters/india-times-handler.ts (new file, 33 lines)
@@ -0,0 +1,33 @@
import { ContentHandler } from '../content-handler'
import addressparser from 'addressparser'

export class IndiaTimesHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'India Times'
  }

  async isNewsletter(input: {
    from: string
    html: string
    headers: Record<string, string | string[]>
    dom: Document
  }): Promise<boolean> {
    return Promise.resolve(
      addressparser(input.from).some(
        (e) => e.address === 'newsletters@timesofindia.com'
      )
    )
  }

  findNewsletterHeaderHref(dom: Document): string | undefined {
    const readOnline = dom.querySelectorAll('a')
    let res: string | undefined = undefined
    readOnline.forEach((e) => {
      if (e.textContent === 'view in browser') {
        res = e.getAttribute('href') || undefined
      }
    })
    return res
  }
}
examples/omnivore/api/content-handler/src/newsletters/morning-brew-handler.ts (new file, 35 lines)
@@ -0,0 +1,35 @@
import { ContentHandler } from '../content-handler'

export class MorningBrewHandler extends ContentHandler {
  constructor() {
    super()
    this.senderRegex = /Morning Brew <crew@morningbrew.com>/
    this.urlRegex = /<a.* href=["']([^"']*)["'].*>View Online<\/a>/
    this.name = 'morningbrew'
  }

  shouldPreParse(url: string, dom: Document): boolean {
    const host = this.name + '.com'
    // check if url ends with morningbrew.com
    return new URL(url).hostname.endsWith(host)
  }

  async preParse(url: string, dom: Document): Promise<Document> {
    // retain the width of the cells in the table of market info
    dom.querySelectorAll('.markets-arrow-cell').forEach((td) => {
      const table = td.closest('table')
      if (table) {
        const bubbleTable = table.querySelector('.markets-bubble')
        if (bubbleTable) {
          // replace the nested table with the text
          const e = bubbleTable.querySelector('.markets-table-text')
          e && bubbleTable.parentNode?.replaceChild(e, bubbleTable)
        }
        // set custom class for the table
        table.className = 'morning-brew-markets'
      }
    })

    return Promise.resolve(dom)
  }
}
examples/omnivore/api/content-handler/src/newsletters/revue-handler.ts (new file, 44 lines)
@@ -0,0 +1,44 @@
import { ContentHandler } from '../content-handler'

export class RevueHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'revue'
  }

  findNewsletterHeaderHref(dom: Document): string | undefined {
    const viewOnline = dom.querySelectorAll('table tr td a[target="_blank"]')
    let res: string | undefined = undefined
    viewOnline.forEach((e) => {
      if (e.textContent === 'View online') {
        res = e.getAttribute('href') || undefined
      }
    })
    return res
  }

  async isNewsletter(input: {
    from: string
    dom: Document
    headers: Record<string, string | string[]>
  }): Promise<boolean> {
    const dom = input.dom
    if (
      dom.querySelectorAll('img[src*="getrevue.co"], img[src*="revue.email"]')
        .length > 0
    ) {
      const getrevueUrl = this.findNewsletterHeaderHref(dom)
      if (getrevueUrl) {
        return Promise.resolve(true)
      }
    }
    return false
  }

  async parseNewsletterUrl(
    headers: Record<string, string | string[]>,
    html: string
  ): Promise<string | undefined> {
    return this.findNewsletterUrl(html)
  }
}
examples/omnivore/api/content-handler/src/newsletters/substack-handler.ts (new file, 139 lines)
@@ -0,0 +1,139 @@
import addressparser from 'addressparser'
import { ContentHandler } from '../content-handler'

export class SubstackHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'substack'
  }

  shouldPreParse(url: string, dom: Document): boolean {
    const host = this.name + '.com'
    const cdnHost = 'substackcdn.com'
    // check if url ends with substack.com
    // or has a profile image hosted at substack.com or substackcdn.com
    return (
      new URL(url).hostname.endsWith(host) ||
      !!dom
        .querySelector('.email-body img')
        ?.getAttribute('src')
        ?.includes(host || cdnHost)
    )
  }

  async preParse(url: string, dom: Document): Promise<Document> {
    const body = dom.querySelector('.email-body-container')

    // this removes header and profile avatar
    body?.querySelector('.header')?.remove()
    body?.querySelector('.preamble')?.remove()
    body?.querySelector('.meta-author-wrap')?.remove()
    // this removes meta button
    body?.querySelector('.post-meta')?.remove()
    // this removes footer
    body?.querySelector('.post-cta')?.remove()
    body?.querySelector('.container-border')?.remove()
    body?.querySelector('.footer')?.remove()
    // this removes the "restack" button
    body?.querySelector('.email-ufi-2-bottom')?.remove()
    // this removes the "share" button
    body?.querySelector('.email-ufi-2-top')?.remove()

    dom = this.fixupStaticTweets(dom)

    return Promise.resolve(dom)
  }

  findNewsletterHeaderHref(dom: Document): string | undefined {
    // Substack header links
    const postLink = dom.querySelector('h1 a')
    if (postLink) {
      return postLink.getAttribute('href') || undefined
    }

    return undefined
  }

  async isNewsletter({
    headers,
    dom,
  }: {
    from: string
    headers: Record<string, string | string[]>
    dom: Document
  }): Promise<boolean> {
    if (headers['list-post']) {
      return Promise.resolve(true)
    }
    // substack newsletter emails have tables with a *post-meta class
    if (dom.querySelector('table[class$="post-meta"]')) {
      return true
    }
    // If the article has a header link and Substack icons, it's probably a newsletter
    const href = this.findNewsletterHeaderHref(dom)
    const oldHeartIcon = dom.querySelector(
      'table tbody td span a img[src*="HeartIcon"]'
    )
    const oldRecommendIcon = dom.querySelector(
      'table tbody td span a img[src*="RecommendIconRounded"]'
    )
    const heartIcon = dom.querySelector('a img[src*="LucideHeart"]')
    const commentsIcon = dom.querySelector('a img[src*="LucideComments"]')
    return Promise.resolve(
      !!(
        href &&
        (oldHeartIcon || oldRecommendIcon || heartIcon || commentsIcon)
      )
    )
  }

  async parseNewsletterUrl(
    headers: Record<string, string | string[]>,
    html: string
  ): Promise<string | undefined> {
    // raw Substack newsletter url is like <https://hongbo130.substack.com/p/tldr>
    // we need to get the real url from the raw url
    const postHeader = headers['list-post']?.toString()
    if (postHeader && addressparser(postHeader).length > 0) {
      return Promise.resolve(addressparser(postHeader)[0].name)
    }
    return this.findNewsletterUrl(html)
  }

  fixupStaticTweets(dom: Document): Document {
    const preClassName = '_omnivore-static-'
    const staticTweets = dom.querySelectorAll('div[class="tweet static"]')

    if (staticTweets.length < 1) {
      return dom
    }

    const recurse = (node: Element, f: (node: Element) => void) => {
      for (let i = 0; i < node.children.length; i++) {
        const child = node.children[i]
        recurse(child, f)
        f(child)
      }
    }

    for (const tweet of Array.from(staticTweets)) {
      tweet.className = preClassName + 'tweet'
      tweet.removeAttribute('style')

      // get all children, rename their class, remove style
      // elements (style will be handled in the reader)
      recurse(tweet, (n: Element) => {
        const className = n.className
        if (
          className.startsWith('tweet-') ||
          className.startsWith('quote-tweet')
        ) {
          n.className = preClassName + className
        }
        n.removeAttribute('style')
      })
    }

    return dom
  }
}
examples/omnivore/api/content-handler/src/redis.ts (new file, 32 lines)
@@ -0,0 +1,32 @@
import { createClient } from 'redis'

// explicitly create the return type of RedisClient
export type RedisClient = ReturnType<typeof createClient>

export const createRedisClient = async (
  url?: string,
  cert?: string
): Promise<RedisClient> => {
  const redisClient = createClient({
    url,
    socket: {
      tls: url?.startsWith('rediss://'), // rediss:// is the protocol for TLS
      cert: cert?.replace(/\\n/g, '\n'), // replace \n with new line
      rejectUnauthorized: false, // for self-signed certs
      connectTimeout: 10000, // 10 seconds
      reconnectStrategy(retries: number): number | Error {
        if (retries > 10) {
          return new Error('Retries exhausted')
        }
        return 1000
      },
    },
  })

  redisClient.on('error', (err) => console.error('Redis Client Error', err))

  await redisClient.connect()
  console.log('Redis Client Connected:', url)

  return redisClient
}
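A hedged usage sketch, not part of this commit; the key and value are placeholders, the environment variable names follow the ones nitter-handler.ts uses, and the returned client is a standard node-redis v4 client:

import { createRedisClient } from './redis'

const example = async () => {
  const client = await createRedisClient(process.env.REDIS_URL, process.env.REDIS_CERT)
  await client.set('greeting', 'hello') // plain node-redis v4 commands from here on
  console.log(await client.get('greeting'))
  await client.quit()
}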
examples/omnivore/api/content-handler/src/websites/apple-news-handler.ts (new file, 31 lines)
@@ -0,0 +1,31 @@
import axios from 'axios'
import { parseHTML } from 'linkedom'
import { ContentHandler, PreHandleResult } from '../content-handler'

export class AppleNewsHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Apple News'
  }

  shouldPreHandle(url: string): boolean {
    const u = new URL(url)
    return u.hostname === 'apple.news'
  }

  async preHandle(url: string): Promise<PreHandleResult> {
    const MOBILE_USER_AGENT =
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
    const response = await axios.get(url, {
      headers: { 'User-Agent': MOBILE_USER_AGENT },
    })
    const data = response.data as string
    const dom = parseHTML(data).document
    // make sure it's a valid URL by wrapping in new URL
    const href = dom
      .querySelector('span.click-here')
      ?.parentElement?.getAttribute('href')
    const u = href ? new URL(href) : undefined
    return { url: u?.href }
  }
}
examples/omnivore/api/content-handler/src/websites/ars-technica-handler.ts (new file, 86 lines)
@@ -0,0 +1,86 @@
import axios from 'axios'
import { parseHTML } from 'linkedom'
import { ContentHandler, PreHandleResult } from '../content-handler'

/**
 * Some of the content on Ars Technica is split over several pages.
 * If this is the case we should unfurl the entire article into one.
 */
export class ArsTechnicaHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'ArsTechnica'
  }

  shouldPreHandle(url: string): boolean {
    const u = new URL(url)
    return u.hostname.endsWith('arstechnica.com')
  }

  hasMultiplePages(document: Document): boolean {
    return document.querySelectorAll('nav.page-numbers')?.length != 0
  }

  async grabContentFromUrl(url: string): Promise<Document> {
    const response = await axios.get(url)
    const data = response.data as string
    return parseHTML(data).document
  }

  async extractArticleContentsFromLink(url: string): Promise<Document[]> {
    const dom = await this.grabContentFromUrl(url)
    const articleContent = dom.querySelector('[itemprop="articleBody"]')
    return [].slice.call(articleContent?.childNodes || [])
  }

  async expandLinksAndCombine(document: Document): Promise<Document> {
    const pageNumbers = document.querySelector('nav.page-numbers')
    const articleBody = document.querySelector('[itemprop="articleBody"]')

    if (!pageNumbers || !articleBody) {
      // We shouldn't ever really get here, but sometimes weird things happen.
      return document
    }

    const pageLinkNodes = pageNumbers.querySelectorAll('a')
    // Remove the "Next" Link, as it will duplicate some content.
    const pageLinks =
      Array.from(pageLinkNodes)
        ?.slice(0, pageLinkNodes.length - 1)
        ?.map(({ href }) => href) ?? []

    const pageContents = await Promise.all(
      pageLinks.map(this.extractArticleContentsFromLink.bind(this))
    )

    for (const articleContents of pageContents) {
      // We place all the content in a span to indicate that a page has been parsed.
      const span = document.createElement('SPAN')
      span.className = 'nextPageContents'
      span.append(...articleContents)
      articleBody.append(span)
    }
    pageNumbers.remove()

    return document
  }

  async preHandle(url: string): Promise<PreHandleResult> {
    // We simply retrieve the article without Javascript enabled using a GET command.
    const dom = await this.grabContentFromUrl(url)
    if (!this.hasMultiplePages(dom)) {
      return {
        content: dom.body.outerHTML,
        title: dom.title,
        dom,
      }
    }

    const expandedDom = await this.expandLinksAndCombine(dom)
    return {
      content: expandedDom.body.outerHTML,
      title: dom.title,
      dom: expandedDom,
    }
  }
}
examples/omnivore/api/content-handler/src/websites/bloomberg-handler.ts (new file, 41 lines)
@@ -0,0 +1,41 @@
import axios from 'axios'
import { parseHTML } from 'linkedom'
import { ContentHandler, PreHandleResult } from '../content-handler'

export class BloombergHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Bloomberg'
  }

  shouldPreHandle(url: string): boolean {
    const BLOOMBERG_URL_MATCH =
      /https?:\/\/(www\.)?bloomberg.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_+.~#?&/=]*)/
    return BLOOMBERG_URL_MATCH.test(url.toString())
  }

  async preHandle(url: string): Promise<PreHandleResult> {
    console.log('prehandling bloomberg url', url)

    try {
      const response = await axios.get('https://app.scrapingbee.com/api/v1', {
        params: {
          api_key: process.env.SCRAPINGBEE_API_KEY,
          url: url,
          return_page_source: true,
          block_ads: true,
          block_resources: false,
        },
      })
      const dom = parseHTML(response.data).document
      return {
        title: dom.title,
        content: dom.querySelector('body')?.innerHTML,
        url: url,
      }
    } catch (error) {
      console.error('error prehandling bloomberg url', error)
      throw error
    }
  }
}
examples/omnivore/api/content-handler/src/websites/derstandard-handler.ts (new file, 34 lines)
@@ -0,0 +1,34 @@
import { ContentHandler, PreHandleResult } from '../content-handler'
import axios from 'axios'
import { parseHTML } from 'linkedom'

export class DerstandardHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Derstandard'
  }

  shouldPreHandle(url: string): boolean {
    const u = new URL(url)
    return u.hostname === 'www.derstandard.at'
  }

  async preHandle(url: string): Promise<PreHandleResult> {
    const response = await axios.get(url, {
      // set cookie to give consent to get the article
      headers: {
        cookie: `DSGVO_ZUSAGE_V1=true; consentUUID=2bacb9c1-1e80-4be0-9f7b-ee987cf4e7b0_6`,
      },
    })
    const content = response.data as string

    const dom = parseHTML(content).document
    const titleElement = dom.querySelector('.article-title')
    titleElement && titleElement.remove()

    return {
      content: dom.body.outerHTML,
      title: titleElement?.textContent || undefined,
    }
  }
}
examples/omnivore/api/content-handler/src/websites/github-handler.ts (new file, 44 lines)
@@ -0,0 +1,44 @@
import { ContentHandler } from '../content-handler'

export class GitHubHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'github'
  }

  shouldPreParse(url: string, dom: Document): boolean {
    return new URL(url).hostname.endsWith('github.com')
  }

  async preParse(url: string, dom: Document): Promise<Document> {
    const body = dom.querySelector('body')
    const article = dom.querySelector('article')
    const twitterTitle = dom.querySelector(`meta[name='twitter:title']`)
    const linkAuthor = dom.querySelector(`span[itemprop='author']`)

    if (body && article) {
      body.replaceChildren(article)

      // Attempt to set the author also. This is available on repo homepages
      // but not on things like PRs. Ideally we want PRs and issues to have
      // author set to the author of the PR/issue.
      if (linkAuthor && linkAuthor.textContent) {
        const author = dom.createElement('span')
        author.setAttribute('rel', 'author')
        author.innerHTML = linkAuthor.textContent
        article.appendChild(author)
      }
    }

    // Remove the GitHub - and repo org from the title
    const twitterTitleContent = twitterTitle?.getAttribute('content')
    if (twitterTitle && twitterTitleContent) {
      twitterTitle.setAttribute(
        'content',
        twitterTitleContent.replace(/GitHub - (.*?)\//, '')
      )
    }

    return Promise.resolve(dom)
  }
}
examples/omnivore/api/content-handler/src/websites/image-handler.ts (new file, 33 lines)
@@ -0,0 +1,33 @@
import { ContentHandler, PreHandleResult } from '../content-handler'

export class ImageHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Image'
  }

  shouldPreHandle(url: string): boolean {
    const IMAGE_URL_PATTERN = /(https?:\/\/.*\.(?:jpg|jpeg|png|webp))/i
    return IMAGE_URL_PATTERN.test(url.toString())
  }

  async preHandle(url: string): Promise<PreHandleResult> {
    const title = url.toString().split('/').pop() || 'Image'
    const content = `
      <html>
        <head>
          <title>${title}</title>
          <meta property="og:image" content="${url}" />
          <meta property="og:title" content="${title}" />
          <meta property="og:type" content="image" />
        </head>
        <body>
          <div>
            <img src="${url}" alt="${title}">
          </div>
        </body>
      </html>`

    return Promise.resolve({ title, content })
  }
}
examples/omnivore/api/content-handler/src/websites/medium-handler.ts (new file, 26 lines)
@@ -0,0 +1,26 @@
import { ContentHandler, PreHandleResult } from '../content-handler'

export class MediumHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Medium'
  }

  shouldPreHandle(url: string): boolean {
    const u = new URL(url)
    return u.hostname.endsWith('medium.com')
  }

  async preHandle(url: string): Promise<PreHandleResult> {
    console.log('prehandling medium url', url)

    try {
      const res = new URL(url)
      res.searchParams.delete('source')
      return Promise.resolve({ url: res.toString() })
    } catch (error) {
      console.error('error prehandling medium url', error)
      throw error
    }
  }
}
@@ -0,0 +1,417 @@
|
||||
import axios from 'axios'
|
||||
import { parseHTML } from 'linkedom'
|
||||
import _, { truncate } from 'lodash'
|
||||
import { DateTime } from 'luxon'
|
||||
import { ContentHandler, PreHandleResult } from '../content-handler'
|
||||
import { createRedisClient, RedisClient } from '../redis'
|
||||
|
||||
interface Tweet {
|
||||
url: string
|
||||
author: {
|
||||
username: string
|
||||
name: string
|
||||
profileImageUrl: string
|
||||
}
|
||||
text: string
|
||||
entities: {
|
||||
urls: {
|
||||
url: string
|
||||
displayUrl: string
|
||||
}[]
|
||||
}
|
||||
attachments: {
|
||||
type: string
|
||||
url: string
|
||||
previewUrl: string
|
||||
}[]
|
||||
createdAt: string
|
||||
}
|
||||
|
||||
export class NitterHandler extends ContentHandler {
|
||||
// matches twitter.com and nitter.net urls
|
||||
URL_MATCH =
|
||||
/((twitter\.com)|(nitter\.net))\/(?:#!\/)?(\w+)\/status(?:es)?\/(\d+)(?:\/.*)?/
|
||||
INSTANCES = [
|
||||
{ value: 'https://nitter.moomoo.me', score: 0 },
|
||||
{ value: 'https://nitter.net', score: 1 }, // the official instance
|
||||
{ value: 'https://nitter.lacontrevoie.fr', score: 2 },
|
||||
{ value: 'https://nitter.kavin.rocks', score: 3 },
|
||||
{ value: 'https://notabird.site', score: 4 },
|
||||
{ value: 'https://singapore.unofficialbird.com', score: 5 },
|
||||
{ value: 'https://nitter.fly.dev', score: 6 },
|
||||
]
|
||||
REDIS_KEY = 'nitter-instances'
|
||||
|
||||
private instance: string
|
||||
|
||||
constructor() {
|
||||
super()
|
||||
this.name = 'Nitter'
|
||||
this.instance = ''
|
||||
}
|
||||
|
||||
async getInstances(redisClient: RedisClient) {
|
||||
// get instances by score in ascending order
|
||||
const instances = await redisClient.zRange(this.REDIS_KEY, '-inf', '+inf', {
|
||||
BY: 'SCORE',
|
||||
})
|
||||
console.debug('instances', instances)
|
||||
|
||||
// if no instance is found, save the default instances
|
||||
if (instances.length === 0) {
|
||||
const result = await redisClient.zAdd(this.REDIS_KEY, this.INSTANCES, {
|
||||
NX: true, // only add if the key does not exist
|
||||
})
|
||||
console.debug('add instances', result)
|
||||
|
||||
// expire the key after 1 day
|
||||
const exp = await redisClient.expire(this.REDIS_KEY, 60 * 60 * 24)
|
||||
console.debug('instances expire in 1 day', exp)
|
||||
|
||||
return this.INSTANCES.map((i) => i.value)
|
||||
}
|
||||
|
||||
return instances
|
||||
}
|
||||
|
||||
async incrementInstanceScore(
|
||||
redisClient: RedisClient,
|
||||
instance: string,
|
||||
score = 1
|
||||
) {
|
||||
await redisClient.zIncrBy(this.REDIS_KEY, score, instance)
|
||||
}
|
||||
|
||||
async getTweets(username: string, tweetId: string) {
|
||||
function authorParser(header: Element) {
|
||||
const profileImageUrl =
|
||||
header.querySelector('.tweet-avatar img')?.getAttribute('src') ?? ''
|
||||
const name =
|
||||
header.querySelector('.fullname')?.getAttribute('title') ?? ''
|
||||
const username =
|
||||
header.querySelector('.username')?.getAttribute('title') ?? ''
|
||||
|
||||
return {
|
||||
profileImageUrl,
|
||||
name,
|
||||
username: username.replace('@', ''), // remove @ from username
|
||||
}
|
||||
}
|
||||
|
||||
function dateParser(date: Element) {
|
||||
const validDateTime =
|
||||
date.getAttribute('title')?.replace(' · ', ' ') ?? Date.now()
|
||||
|
||||
return new Date(validDateTime).toISOString()
|
||||
}
|
||||
|
||||
function urlParser(date: Element) {
|
||||
return date.getAttribute('href') ?? ''
|
||||
}
|
||||
|
||||
function attachmentParser(attachments: Element | null) {
|
||||
if (!attachments) return []
|
||||
|
||||
const photos = Array.from(attachments.querySelectorAll('img')).map(
|
||||
(i) => ({
|
||||
url: i.getAttribute('src') ?? '',
|
||||
type: 'photo',
|
||||
previewUrl: i.getAttribute('src') ?? '',
|
||||
})
|
||||
)
|
||||
const videos = Array.from(attachments.querySelectorAll('video')).map(
|
||||
(i) => ({
|
||||
url: i.getAttribute('data-url') ?? '',
|
||||
type: 'video',
|
||||
previewUrl: i.getAttribute('poster') ?? '',
|
||||
})
|
||||
)
|
||||
|
||||
return [...photos, ...videos]
|
||||
}
|
||||
|
||||
function parseTweet(tweet: Element): Tweet | null {
|
||||
const header = tweet.querySelector('.tweet-header')
|
||||
if (!header) {
|
||||
console.error('no header found', tweet)
|
||||
return null
|
||||
}
|
||||
const author = authorParser(header)
|
||||
|
||||
const body = tweet.querySelector('.tweet-body')
|
||||
if (!body) {
|
||||
console.error('no body found', tweet)
|
||||
return null
|
||||
}
|
||||
|
||||
const tweetDateElement = body.querySelector('.tweet-date a')
|
||||
if (!tweetDateElement) {
|
||||
console.error('no tweet date found', tweet)
|
||||
return null
|
||||
}
|
||||
const createdAt = dateParser(tweetDateElement)
|
||||
const url = urlParser(tweetDateElement)
|
||||
|
||||
const content = body.querySelector('.tweet-content')
|
||||
if (!content) {
|
||||
console.error('no content found', tweet)
|
||||
return null
|
||||
}
|
||||
const text = content.textContent ?? ''
|
||||
const urls = Array.from(content.querySelectorAll('a')).map((a) => ({
|
||||
url: a.getAttribute('href') ?? '',
|
||||
displayUrl: a.textContent ?? '',
|
||||
}))
|
||||
|
||||
const attachments = attachmentParser(body.querySelector('.attachments'))
|
||||
|
||||
return {
|
||||
author,
|
||||
createdAt,
|
||||
text,
|
||||
url,
|
||||
entities: {
|
||||
urls,
|
||||
},
|
||||
attachments,
|
||||
}
|
||||
}
|
||||
|
||||
const redisClient = await createRedisClient(
|
||||
process.env.REDIS_URL,
|
||||
process.env.REDIS_CERT
|
||||
)
|
||||
|
||||
try {
|
||||
const tweets: Tweet[] = []
|
||||
const option = {
|
||||
timeout: 20000, // 20 seconds
|
||||
}
|
||||
let html = ''
|
||||
// get instances from redis
|
||||
const instances = await this.getInstances(redisClient)
|
||||
for (const instance of instances) {
|
||||
try {
|
||||
const url = `${instance}/${username}/status/${tweetId}`
|
||||
const startTime = Date.now()
|
||||
const response = await axios.get(url, option)
|
||||
const latency = Math.floor(Date.now() - startTime)
|
||||
console.debug('latency', latency)
|
||||
|
||||
html = response.data as string
|
||||
this.instance = instance
|
||||
|
||||
await this.incrementInstanceScore(redisClient, instance, latency)
|
||||
break
|
||||
} catch (error) {
|
||||
await this.incrementInstanceScore(
|
||||
redisClient,
|
||||
instance,
|
||||
option.timeout
|
||||
)
|
||||
|
||||
if (axios.isAxiosError(error)) {
|
||||
console.info(`Error getting tweets from ${instance}`, error.message)
|
||||
} else {
|
||||
console.info(`Error getting tweets from ${instance}`, error)
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!this.instance || !html) {
|
||||
console.error('no instance or html found')
|
||||
return []
|
||||
}
|
||||
|
||||
const document = parseHTML(html).document
|
||||
|
||||
// get the main thread including tweets and threads
|
||||
const mainThread = document.querySelector('.main-thread')
|
||||
if (!mainThread) {
|
||||
console.error('no main thread found')
|
||||
return []
|
||||
}
|
||||
const timelineItems = Array.from(
|
||||
        mainThread.querySelectorAll('.timeline-item')
      )

      if (timelineItems.length === 0) {
        console.error('no timeline items found')
        return []
      }

      for (let i = 0; i < timelineItems.length; i++) {
        const item = timelineItems[i]
        const classList = item.classList
        // skip unavailable tweets and earlier replies
        if (
          classList.contains('unavailable') ||
          classList.contains('earlier-replies')
        ) {
          console.info('skip unavailable tweets and earlier replies')
          continue
        }
        // if there are more replies, get them
        if (classList.contains('more-replies')) {
          const newUrl = item.querySelector('a')?.getAttribute('href')
          if (!newUrl) {
            console.error('no new url', newUrl)
            break
          }

          let html = ''
          try {
            // go to new url and wait for it to load
            const response = await axios.get(
              `${this.instance}${newUrl}`,
              option
            )

            html = response.data as string
          } catch (error) {
            console.error('Error getting tweets', error)
            break
          }

          const document = parseHTML(html).document
          const nextThread = document.querySelector('.main-thread .after-tweet')
          if (!nextThread) {
            console.error('no next thread found')
            break
          }

          // get the new timeline items and add them to the list
          const newTimelineItems = Array.from(
            nextThread.querySelectorAll('.timeline-item')
          )

          timelineItems.push(...newTimelineItems)
          continue
        }

        const tweet = parseTweet(item)
        // filter out replies
        if (
          tweet &&
          tweet.author.username.toLowerCase() === username.toLowerCase()
        ) {
          tweets.push(tweet)
        }
      }

      return tweets
    } catch (error) {
      console.error('Error getting tweets', error)

      return []
    } finally {
      await redisClient?.quit()
    }
  }

  parseTweetUrl = (url: string) => {
    const match = url.match(this.URL_MATCH)
    return {
      domain: match?.[1],
      username: match?.[4],
      tweetId: match?.[5],
    }
  }

  titleForTweet = (author: { name: string }, text: string) => {
    return `${author.name} on Twitter: ${truncate(text.replace(/http\S+/, ''), {
      length: 100,
    })}`
  }

  formatTimestamp = (timestamp: string) => {
    return DateTime.fromJSDate(new Date(timestamp)).toLocaleString(
      DateTime.DATETIME_FULL
    )
  }

  shouldPreHandle(url: string): boolean {
    return this.URL_MATCH.test(url.toString())
  }

  async preHandle(url: string): Promise<PreHandleResult> {
    const { tweetId, username, domain } = this.parseTweetUrl(url)
    if (!tweetId || !username || !domain) {
      throw new Error('could not parse tweet url')
    }
    const tweets = await this.getTweets(username, tweetId)
    if (tweets.length === 0) {
      throw new Error('could not get tweets')
    }

    const tweet = tweets[0]
    const author = tweet.author
    // escape html entities in title
    const title = this.titleForTweet(author, tweet.text)
    const escapedTitle = _.escape(title)
    const authorImage = `${this.instance}${author.profileImageUrl.replace(
      '_normal',
      '_400x400'
    )}`
    const description = _.escape(tweet.text) || escapedTitle
    const imageDomain =
      domain.toLowerCase() === 'twitter.com'
        ? 'https://pbs.twimg.com'
        : 'https://nitter.net/pic'

    let tweetsContent = ''
    for (const tweet of tweets) {
      let text = tweet.text
      for (const urlObj of tweet.entities.urls) {
        text = text.replace(
          urlObj.displayUrl,
          `<a href="${urlObj.url}">${urlObj.displayUrl}</a>`
        )
      }

      const includesHtml = tweet.attachments
        .map(
          (attachment) =>
            `<a class="media-link" href=${imageDomain}${decodeURIComponent(
              attachment.url
            ).replace('/pic', '')}>
          <picture>
            <img class="tweet-img" src=${imageDomain}${decodeURIComponent(
              attachment.previewUrl
            ).replace('/pic', '')} />
          </picture>
          </a>`
        )
        .join('\n')

      tweetsContent += `<p class="_omnivore_tweet_content">${text}</p>${includesHtml}`
    }

    const tweetUrl = `
       — <a href="https://${domain}/${author.username}">${
      author.username
    }</a> <span itemscope itemtype="https://schema.org/Person" itemprop="author">${
      author.name
    }</span> <a href="${url}">${this.formatTimestamp(tweet.createdAt)}</a>`

    const content = `
    <html>
      <head>
        <meta property="og:image" content="${authorImage}" />
        <meta property="og:image:secure_url" content="${authorImage}" />
        <meta property="og:title" content="${escapedTitle}" />
        <meta property="og:description" content="${description}" />
        <meta property="article:published_time" content="${tweet.createdAt}" />
        <meta property="og:site_name" content="Twitter" />
        <meta property="og:type" content="tweet" />
        <meta property="dc:creator" content="${author.name}" />
        <meta property="twitter:description" content="${description}" />
      </head>
      <body>
        <div class="_omnivore_twitter">
          ${tweetsContent}
          ${tweetUrl}
        </div>
      </body>
    </html>`

    return { content, url, title }
  }
}
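A usage sketch, not part of the commit: how `parseTweetUrl` feeds `preHandle`. The class name `NitterHandler` and the example capture values are assumptions; the actual grouping comes from `URL_MATCH`, defined earlier in this file.

const handler = new NitterHandler()
const parts = handler.parseTweetUrl(
  'https://twitter.com/OmnivoreApp/status/1234567890'
)
// assuming URL_MATCH captures domain/username/tweetId as used above:
// parts.domain === 'twitter.com', parts.username === 'OmnivoreApp',
// parts.tweetId === '1234567890'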
@@ -0,0 +1,18 @@
import { ContentHandler, PreHandleResult } from '../content-handler'

export class PdfHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'PDF'
  }

  shouldPreHandle(url: string): boolean {
    const u = new URL(url)
    const path = u.pathname.replace(u.search, '')
    return path.endsWith('.pdf')
  }

  async preHandle(url: string): Promise<PreHandleResult> {
    return Promise.resolve({ contentType: 'application/pdf' })
  }
}
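A sketch of how a dispatcher might consume handlers like `PdfHandler`; the registry and the `preHandleContent` entry point are illustrative assumptions, not part of this commit:

const handlers: ContentHandler[] = [new PdfHandler()]

const preHandleContent = async (url: string): Promise<PreHandleResult> => {
  for (const handler of handlers) {
    if (handler.shouldPreHandle(url)) {
      // hand off to the first handler that claims the URL
      return handler.preHandle(url)
    }
  }
  return { url }
}

// await preHandleContent('https://example.com/paper.pdf')
// => { contentType: 'application/pdf' }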
@@ -0,0 +1,83 @@
import axios from 'axios'
import _ from 'underscore'
import { ContentHandler, PreHandleResult } from '../content-handler'

export class PipedVideoHandler extends ContentHandler {
  // https://piped.video/watch?v={videoId}
  PIPED_URL_MATCH = /^((?:https?:)?\/\/)?piped\.video\/watch\?v=[^&]+/

  constructor() {
    super()
    this.name = 'Piped-video'
  }

  getYoutubeVideoId = (url: string) => {
    const u = new URL(url)
    return u.searchParams.get('v')
  }

  escapeTitle = (title: string) => {
    return _.escape(title)
  }

  shouldPreHandle(url: string): boolean {
    return this.PIPED_URL_MATCH.test(url.toString())
  }

  async preHandle(url: string): Promise<PreHandleResult> {
    const videoId = this.getYoutubeVideoId(url)
    if (!videoId) {
      return {}
    }
    const baseUrl = 'https://api-piped.mha.fi'
    const apiUrl = `${baseUrl}/streams/${videoId}`
    const metadata = (await axios.get(apiUrl)).data as {
      title: string
      thumbnailUrl: string
      uploader: string
      uploaderUrl: string
      uploadDate: string
      description: string
      videoStreams: {
        width: number
        height: number
        url: string
      }[]
    }
    const videoStreams = metadata.videoStreams
    if (!videoStreams || videoStreams.length === 0) {
      return {}
    }
    const videoStream = videoStreams[0]
    const src = `https://piped.mha.fi/embed/${videoId}`
    // escape html entities in title
    const title = metadata.title
    const escapedTitle = this.escapeTitle(title)
    const ratio = videoStream.width / videoStream.height
    const thumbnail = metadata.thumbnailUrl
    const height = 350
    const width = height * ratio
    const authorName = _.escape(metadata.uploader)
    const content = `
    <html>
      <head>
        <title>${escapedTitle}</title>
        <meta property="og:image" content="${thumbnail}" />
        <meta property="og:image:secure_url" content="${thumbnail}" />
        <meta property="og:title" content="${escapedTitle}" />
        <meta property="og:description" content="${metadata.description}" />
        <meta property="og:article:author" content="${authorName}" />
        <meta property="og:site_name" content="Piped Video" />
        <meta property="article:published_time" content="${metadata.uploadDate}" />
        <meta property="og:type" content="video" />
      </head>
      <body>
        <iframe width="${width}" height="${height}" src="${src}" title="${escapedTitle}" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
        <p><a href="${url}" target="_blank">${escapedTitle}</a></p>
        <p itemscope="" itemprop="author" itemtype="http://schema.org/Person">By <a href="https://piped.video${metadata.uploaderUrl}" target="_blank">${authorName}</a></p>
      </body>
    </html>`

    return { content, title }
  }
}
@@ -0,0 +1,38 @@
import { ContentHandler, PreHandleResult } from '../content-handler'
import axios from 'axios'
import { parseHTML } from 'linkedom'

export class ScrapingBeeHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'ScrapingBee'
  }

  shouldPreHandle(url: string): boolean {
    const u = new URL(url)
    const hostnames = ['nytimes.com', 'news.google.com', 'fool.ca']

    return hostnames.some((h) => u.hostname.endsWith(h))
  }

  async preHandle(url: string): Promise<PreHandleResult> {
    console.log('prehandling url with scrapingbee', url)

    try {
      const response = await axios.get('https://app.scrapingbee.com/api/v1', {
        params: {
          api_key: process.env.SCRAPINGBEE_API_KEY,
          url: url,
          return_page_source: true,
          block_ads: true,
          block_resources: false,
        },
      })
      const dom = parseHTML(response.data).document
      return { title: dom.title, content: response.data as string, url: url }
    } catch (error) {
      console.error('error prehandling url w/scrapingbee', error)
      throw error
    }
  }
}
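A hedged sketch of calling the handler directly; it assumes `SCRAPINGBEE_API_KEY` is set in the environment (the API call fails without it), and the article URL is illustrative:

const sb = new ScrapingBeeHandler()
const articleUrl = 'https://www.nytimes.com/2023/01/01/technology/example.html'
if (sb.shouldPreHandle(articleUrl)) {
  // returns the rendered page source plus the <title> parsed out of it
  const { title, content } = await sb.preHandle(articleUrl)
}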
@@ -0,0 +1,121 @@
import { ContentHandler } from '../content-handler'

export class StackOverflowHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'stackoverflow'
  }

  parseText(element: Element, title: string) {
    const newText = element.ownerDocument.createElement('div')
    const text = element.querySelector(`div[itemprop='text']`)
    if (text) {
      const votes = element
        .querySelector(`div[itemprop='upvoteCount']`)
        ?.getAttribute('data-value')

      if (votes) {
        newText.innerHTML = `<h2>${title}: ${votes} vote${
          votes === '1' ? '' : 's'
        }</h2>${text.innerHTML}`
      }
    }
    return newText
  }

  parseComments(element: Element) {
    const dom = element.ownerDocument
    const newComments = dom.createElement('div')

    // comments
    const commentsDiv = element.querySelector(`.comments`)
    if (commentsDiv) {
      const comments = commentsDiv.querySelectorAll(`.comment`)
      if (comments.length > 0) {
        newComments.innerHTML = `<h3>Comments</h3>`

        comments.forEach((comment) => {
          const author = comment.querySelector(`.comment-user`)
          const text = comment.querySelector(`.comment-copy`)?.textContent
          const authorHref = author?.getAttribute('href')
          const date = comment.querySelector(`.relativetime-clean`)?.textContent
          if (author && text && authorHref && date) {
            const newComment = dom.createElement('p')
            newComment.innerHTML = `<a href="${authorHref}"><b>${author.innerHTML}</b></a>: ${text} - ${date}`
            newComments.appendChild(newComment)
          }
        })
      }
    }

    return newComments
  }

  parseAuthors(element: Element) {
    const dom = element.ownerDocument
    const newAuthors = dom.createElement('div')

    const authors = element.querySelectorAll(`.post-signature`)
    authors.forEach((author) => {
      const isOwner = author.classList.contains('owner')
      const name = author.querySelector(`.user-details a`)?.textContent
      const link = author.querySelector(`.user-details a`)?.getAttribute('href')
      const reputation = author.querySelector(`.reputation-score`)?.textContent
      const badges = Array.from(
        author.querySelectorAll(`span[title*='badges']`)
      )
        .map((badge) => badge.getAttribute('title'))
        .join(', ')
      const date = author.querySelector(`.user-action-time`)?.textContent
      if (name && link && reputation && date) {
        const newAuthor = dom.createElement('p')
        newAuthor.innerHTML = `<a href="${link}"><b>${name}</b></a> - ${reputation} reputation - ${
          badges || 'no badge'
        } - ${date}`
        if (isOwner) {
          const author = dom.createElement('span')
          author.setAttribute('rel', 'author')
          author.innerHTML = name
          newAuthor.appendChild(author)
        }
        newAuthors.appendChild(newAuthor)
      }
    })

    return newAuthors
  }

  shouldPreParse(url: string, dom: Document): boolean {
    return new URL(url).hostname.endsWith('stackoverflow.com')
  }

  async preParse(url: string, dom: Document): Promise<Document> {
    const mainEntity = dom.querySelector(`div[itemprop='mainEntity']`)
    if (mainEntity) {
      const newMainEntity = dom.createElement('div')
      const question = mainEntity.querySelector('#question')
      if (question) {
        newMainEntity.appendChild(this.parseText(question, 'Question'))
        newMainEntity.appendChild(this.parseAuthors(question))
        newMainEntity.appendChild(this.parseComments(question))
      }

      const answersDiv = mainEntity.querySelector('#answers')
      if (answersDiv) {
        const answers = answersDiv.querySelectorAll(`.answer`)
        answers.forEach((answer) => {
          const title = answer.classList.contains('accepted-answer')
            ? 'Accepted Answer'
            : 'Answer'
          newMainEntity.appendChild(this.parseText(answer, title))
          newMainEntity.appendChild(this.parseAuthors(answer))
          newMainEntity.appendChild(this.parseComments(answer))
        })
      }

      dom.body.replaceChildren(newMainEntity)
    }

    return Promise.resolve(dom)
  }
}
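A sketch of the pre-parse flow for a question page; fetching the HTML is assumed to happen elsewhere, and the URL is illustrative:

import { parseHTML } from 'linkedom'

const so = new StackOverflowHandler()
const questionUrl = 'https://stackoverflow.com/questions/11227809/example'
const dom = parseHTML(html).document // `html` fetched by the caller
if (so.shouldPreParse(questionUrl, dom)) {
  const cleaned = await so.preParse(questionUrl, dom)
  // cleaned.body now holds only the question, answers, authors and comments
}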
@@ -0,0 +1,26 @@
import { ContentHandler } from '../content-handler'
import axios from 'axios'

export class TDotCoHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 't.co'
  }

  shouldResolve(url: string): boolean {
    const T_DOT_CO_URL_MATCH = /^https:\/\/(?:www\.)?t\.co\/.*$/
    return T_DOT_CO_URL_MATCH.test(url)
  }

  async resolve(url: string) {
    return axios
      .get(url, { maxRedirects: 0, validateStatus: null })
      .then((res) => {
        return new URL(res.headers.location).href
      })
      .catch((err) => {
        console.log('err with t.co url', err)
        return undefined
      })
  }
}
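Because `maxRedirects` is 0, axios hands back the 301/302 response itself, and the expanded target is read from its `location` header rather than being followed. A usage sketch (the t.co path is illustrative):

const tco = new TDotCoHandler()
if (tco.shouldResolve('https://t.co/AbC123')) {
  const expanded = await tco.resolve('https://t.co/AbC123')
  // expanded is the redirect target URL, or undefined if the lookup failed
}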
@@ -0,0 +1,59 @@
import axios from 'axios'
import { parseHTML } from 'linkedom'
import { ContentHandler, PreHandleResult } from '../content-handler'

export class TheAtlanticHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'The Atlantic'
  }

  shouldPreHandle(url: string): boolean {
    const u = new URL(url)
    return u.hostname.endsWith('theatlantic.com')
  }

  removeRelatedContentLinks(articleContent: Element): Node[] {
    const content = Array.from(articleContent.children)
    return content.filter(
      (paragraph) => !paragraph.className.startsWith('ArticleRelated')
    )
  }

  unfurlContent(content: Document): Document {
    const articleContentSection = content.querySelector(
      '[data-event-module="article body"]'
    )

    // Remove the audio player.
    content.querySelector('[data-event-module="audio player"]')?.remove()

    if (!articleContentSection) {
      return content
    }

    const articleContent = this.removeRelatedContentLinks(articleContentSection)
    const divOverArticle = content.createElement('div')
    divOverArticle.setAttribute('id', 'prehandled')
    articleContent.forEach((it) => divOverArticle.appendChild(it))

    // insertBefore must be called on the section's parent node, not on the
    // document itself, so the reference-node check passes.
    articleContentSection.parentNode?.insertBefore(
      divOverArticle,
      articleContentSection
    )
    articleContentSection.remove()

    return content
  }

  async preHandle(url: string): Promise<PreHandleResult> {
    // We simply retrieve the article without JavaScript enabled, using a plain GET request.
    const response = await axios.get(url)
    const data = response.data as string
    const dom = parseHTML(data).document
    const editedDom = this.unfurlContent(dom)

    return {
      content: editedDom.body.outerHTML,
      title: dom.title,
      dom: editedDom,
    }
  }
}
@@ -0,0 +1,388 @@
import axios from 'axios'
import { truncate } from 'lodash'
import { DateTime } from 'luxon'
import { Browser, BrowserContext } from 'puppeteer-core'
import _ from 'underscore'
import { ContentHandler, PreHandleResult } from '../content-handler'

interface TweetIncludes {
  users: {
    id: string
    name: string
    profile_image_url: string
    username: string
  }[]
  media?: {
    preview_image_url: string
    type: string
    url: string
    media_key: string
  }[]
}

interface TweetMeta {
  result_count: number
}

interface TweetData {
  author_id: string
  text: string
  entities: {
    urls: {
      url: string
      expanded_url: string
      display_url: string
    }[]
  }
  created_at: string
  referenced_tweets: {
    type: string
    id: string
  }[]
  conversation_id: string
  attachments?: {
    media_keys: string[]
  }
}

interface Tweet {
  data: TweetData
  includes: TweetIncludes
}

interface Tweets {
  data: TweetData[]
  includes: TweetIncludes
  meta: TweetMeta
}

const TWITTER_BEARER_TOKEN = process.env.TWITTER_BEARER_TOKEN
const TWITTER_URL_MATCH =
  /twitter\.com\/(?:#!\/)?(\w+)\/status(?:es)?\/(\d+)(?:\/.*)?/
const MAX_THREAD_DEPTH = 100

const getTweetFields = () => {
  const TWEET_FIELDS =
    '&tweet.fields=attachments,author_id,conversation_id,created_at,' +
    'entities,geo,in_reply_to_user_id,lang,possibly_sensitive,public_metrics,referenced_tweets,' +
    'source,withheld'
  const EXPANSIONS = '&expansions=author_id,attachments.media_keys'
  const USER_FIELDS =
    '&user.fields=created_at,description,entities,location,pinned_tweet_id,profile_image_url,protected,public_metrics,url,verified,withheld'
  const MEDIA_FIELDS =
    '&media.fields=duration_ms,height,preview_image_url,url,media_key,public_metrics,width'

  return `${TWEET_FIELDS}${EXPANSIONS}${USER_FIELDS}${MEDIA_FIELDS}`
}

// unroll recent tweet thread
const getTweetThread = async (conversationId: string): Promise<Tweets> => {
  const BASE_ENDPOINT = 'https://api.twitter.com/2/tweets/search/recent'
  const apiUrl = new URL(
    BASE_ENDPOINT +
      '?query=' +
      encodeURIComponent(`conversation_id:${conversationId}`) +
      getTweetFields() +
      `&max_results=${MAX_THREAD_DEPTH}`
  )

  if (!TWITTER_BEARER_TOKEN) {
    throw new Error('No Twitter bearer token found')
  }

  const response = await axios.get<Tweets>(apiUrl.toString(), {
    headers: {
      Authorization: `Bearer ${TWITTER_BEARER_TOKEN}`,
      redirect: 'follow',
    },
  })
  return response.data
}

const getTweetById = async (id: string): Promise<Tweet> => {
  const BASE_ENDPOINT = 'https://api.twitter.com/2/tweets/'
  const apiUrl = new URL(BASE_ENDPOINT + id + '?' + getTweetFields())

  if (!TWITTER_BEARER_TOKEN) {
    throw new Error('No Twitter bearer token found')
  }

  const response = await axios.get<Tweet>(apiUrl.toString(), {
    headers: {
      Authorization: `Bearer ${TWITTER_BEARER_TOKEN}`,
      redirect: 'follow',
    },
  })

  return response.data
}

const getTweetsByIds = async (ids: string[]): Promise<Tweets> => {
  const BASE_ENDPOINT = 'https://api.twitter.com/2/tweets?ids='
  const apiUrl = new URL(BASE_ENDPOINT + ids.join() + getTweetFields())

  if (!TWITTER_BEARER_TOKEN) {
    throw new Error('No Twitter bearer token found')
  }

  const response = await axios.get<Tweets>(apiUrl.toString(), {
    headers: {
      Authorization: `Bearer ${TWITTER_BEARER_TOKEN}`,
      redirect: 'follow',
    },
  })

  return response.data
}

const titleForTweet = (author: { name: string }, text: string) => {
  return `${author.name} on Twitter: ${truncate(text.replace(/http\S+/, ''), {
    length: 100,
  })}`
}

const tweetIdFromStatusUrl = (url: string): string | undefined => {
  const match = url.toString().match(TWITTER_URL_MATCH)
  return match?.[2]
}

const formatTimestamp = (timestamp: string) => {
  return DateTime.fromJSDate(new Date(timestamp)).toLocaleString(
    DateTime.DATETIME_FULL
  )
}

const getTweetsFromResponse = (response: Tweets): Tweet[] => {
  const tweets = []
  for (const t of response.data) {
    const media = response.includes.media?.filter((m) =>
      t.attachments?.media_keys?.includes(m.media_key)
    )
    const tweet: Tweet = {
      data: t,
      includes: {
        users: response.includes.users,
        media,
      },
    }
    tweets.push(tweet)
  }
  return tweets
}

const getOldTweets = async (
  browser: Browser,
  conversationId: string,
  username: string
): Promise<Tweet[]> => {
  const tweetIds = await getTweetIds(browser, conversationId, username)
  if (tweetIds.length === 0) {
    return []
  }
  const response = await getTweetsByIds(tweetIds)
  return getTweetsFromResponse(response)
}

const getRecentTweets = async (conversationId: string): Promise<Tweet[]> => {
  const thread = await getTweetThread(conversationId)
  if (thread.meta.result_count === 0) {
    return []
  }
  // tweets are in reverse chronological order in the thread
  return getTweetsFromResponse(thread).reverse()
}

/**
 * Wait for `ms` amount of milliseconds
 * @param {number} ms
 */
const waitFor = (ms: number) =>
  new Promise((resolve) => setTimeout(resolve, ms))

/**
 * Get tweets (even older than 7 days) using puppeteer
 * @param browser
 * @param {string} tweetId
 * @param {string} author
 */
const getTweetIds = async (
  browser: Browser,
  tweetId: string,
  author: string
): Promise<string[]> => {
  const pageURL = `https://twitter.com/${author}/status/${tweetId}`

  let context: BrowserContext | undefined
  try {
    context = await browser.createIncognitoBrowserContext()
    const page = await context.newPage()

    // Modify this variable to control the size of viewport
    const deviceScaleFactor = 0.2
    const height = Math.floor(2000 / deviceScaleFactor)
    const width = Math.floor(1700 / deviceScaleFactor)
    await page.setViewport({ width, height, deviceScaleFactor })

    await page.goto(pageURL, {
      waitUntil: 'networkidle0',
      timeout: 60000, // 60 seconds
    })

    return await page.evaluate(async (author) => {
      /**
       * Wait for `ms` amount of milliseconds
       * @param {number} ms
       */
      const waitFor = (ms: number) =>
        new Promise((resolve) => setTimeout(resolve, ms))

      const ids = []

      // Find the first "Show replies" button and click it
      const showRepliesButton = Array.from(
        document.querySelectorAll('div[dir]')
      )
        .filter(
          (node) => node.children[0] && node.children[0].tagName === 'SPAN'
        )
        .find((node) => node.children[0].innerHTML === 'Show replies')

      if (showRepliesButton) {
        ;(showRepliesButton as HTMLElement).click()

        await waitFor(2000)
      }

      const timeNodes = Array.from(document.querySelectorAll('time'))

      for (const timeNode of timeNodes) {
        /** @type {HTMLAnchorElement | HTMLSpanElement} */
        const timeContainerAnchor: HTMLAnchorElement | HTMLSpanElement | null =
          timeNode.parentElement
        if (!timeContainerAnchor) continue

        if (timeContainerAnchor.tagName === 'SPAN') continue

        const href = timeContainerAnchor.getAttribute('href')
        if (!href) continue

        // Get the tweet id and username from the href: https://twitter.com/username/status/1234567890
        const match = href.match(/\/([^/]+)\/status\/(\d+)/)
        if (!match) continue

        const id = match[2]
        const username = match[1]

        // skip non-author replies
        username === author && ids.push(id)
      }

      return ids
    }, author)
  } catch (error) {
    console.error('Error getting tweets', error)

    return []
  } finally {
    if (context) {
      await context.close()
    }
  }
}

export class TwitterHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Twitter'
  }

  shouldPreHandle(url: string): boolean {
    return !!TWITTER_BEARER_TOKEN && TWITTER_URL_MATCH.test(url.toString())
  }

  async preHandle(url: string, browser: Browser): Promise<PreHandleResult> {
    const tweetId = tweetIdFromStatusUrl(url)
    if (!tweetId) {
      throw new Error('could not find tweet id in url')
    }
    let tweet = await getTweetById(tweetId)
    const conversationId = tweet.data.conversation_id
    if (conversationId !== tweetId) {
      // this is a reply, so we need to get the referenced tweet
      tweet = await getTweetById(conversationId)
    }

    const tweetData = tweet.data
    const authorId = tweetData.author_id
    // note the strict comparison: an assignment (`u.id = authorId`) here
    // would overwrite every user id instead of selecting the author
    const author = tweet.includes.users.filter((u) => u.id === authorId)[0]
    // escape html entities in title
    const title = titleForTweet(author, tweetData.text)
    const escapedTitle = _.escape(title)
    const authorImage = author.profile_image_url.replace('_normal', '_400x400')
    const description = _.escape(tweetData.text)

    // use puppeteer to get all tweet replies in the thread
    const tweets = await getOldTweets(browser, conversationId, author.username)

    let tweetsContent = ''
    for (const tweet of tweets) {
      const tweetData = tweet.data
      let text = tweetData.text
      if (tweetData.entities && tweetData.entities.urls) {
        for (const urlObj of tweetData.entities.urls) {
          text = text.replace(
            urlObj.url,
            `<a href="${urlObj.expanded_url}">${urlObj.display_url}</a>`
          )
        }
      }

      const includesHtml =
        tweet.includes.media
          ?.map((m) => {
            const linkUrl = m.type == 'photo' ? m.url : url
            const previewUrl = m.type == 'photo' ? m.url : m.preview_image_url
            return `<a class="media-link" href=${linkUrl}>
          <picture>
            <img class="tweet-img" src=${previewUrl} />
          </picture>
          </a>`
          })
          .join('\n') ?? ''

      tweetsContent += `
      <p>${text}</p>
      ${includesHtml}
    `
    }

    const tweetUrl = `
       — <a href="https://twitter.com/${author.username}">${
      author.username
    }</a> <span itemscope itemtype="https://schema.org/Person" itemprop="author">${
      author.name
    }</span> <a href="${url}">${formatTimestamp(tweetData.created_at)}</a>
    `

    const content = `
    <html>
      <head>
        <meta property="og:image" content="${authorImage}" />
        <meta property="og:image:secure_url" content="${authorImage}" />
        <meta property="og:title" content="${escapedTitle}" />
        <meta property="og:description" content="${description}" />
        <meta property="article:published_time" content="${tweetData.created_at}" />
        <meta property="og:site_name" content="Twitter" />
        <meta property="og:type" content="tweet" />
      </head>
      <body>
        <div>
          ${tweetsContent}
          ${tweetUrl}
        </div>
      </body>
    </html>`

    return { content, url, title }
  }
}
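A sketch of driving the handler; the puppeteer-core `browser` is assumed to be created by the surrounding service, and without `TWITTER_BEARER_TOKEN` the handler never activates:

const tw = new TwitterHandler()
const statusUrl = 'https://twitter.com/OmnivoreApp/status/1234567890'
if (tw.shouldPreHandle(statusUrl)) {
  // browser: a connected puppeteer-core Browser (assumed to come from the caller)
  const result = await tw.preHandle(statusUrl, browser)
  // result.content is a self-contained HTML rendering of the unrolled thread
}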
@@ -0,0 +1,48 @@
import { DateTime } from 'luxon'
import { ContentHandler } from '../content-handler'

export class WeixinQqHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Weixin QQ'
  }

  shouldPreParse(url: string, dom: Document): boolean {
    return new URL(url).hostname.endsWith('weixin.qq.com')
  }

  async preParse(url: string, dom: Document): Promise<Document> {
    // Retrieve the publish time
    const publishTime = dom.querySelector('#publish_time')?.textContent
    if (publishTime) {
      const dateTimeFormat = 'yyyy-LL-dd HH:mm'
      // published time is in UTC+8
      const publishTimeISO = DateTime.fromFormat(publishTime, dateTimeFormat, {
        zone: 'Asia/Shanghai',
      }).toISO()

      // create a meta node to store the publish time in ISO format
      const metaNode = dom.createElement('meta')
      metaNode.setAttribute('name', 'date')
      metaNode.setAttribute('content', publishTimeISO)
      dom.querySelector('head')?.appendChild(metaNode)
    }
    // Rename the article-info class so the parser preserves this block
    dom
      .querySelector('.rich_media_meta_list')
      ?.setAttribute('class', '_omnivore_rich_media_meta_list')

    // Remove the title
    dom.querySelector('.rich_media_title')?.remove()

    // Remove the profile info
    dom.querySelector('.profile_container')?.remove()

    // Remove the footer
    dom.querySelector('#content_bottom_area')?.remove()
    dom.querySelector('.rich_media_area_extra')?.remove()
    dom.querySelector('#js_pc_qr_code')?.remove()

    return Promise.resolve(dom)
  }
}
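For reference, a small example of the timezone conversion this handler performs; the timestamp value is illustrative:

import { DateTime } from 'luxon'

// a Weixin publish time in UTC+8 becomes an ISO string with the offset kept
const iso = DateTime.fromFormat('2023-01-02 15:04', 'yyyy-LL-dd HH:mm', {
  zone: 'Asia/Shanghai',
}).toISO()
// iso === '2023-01-02T15:04:00.000+08:00'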
@@ -0,0 +1,24 @@
import { ContentHandler } from '../content-handler'

export class WikipediaHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'wikipedia'
  }

  shouldPreParse(url: string, dom: Document): boolean {
    return new URL(url).hostname.endsWith('wikipedia.org')
  }

  async preParse(url: string, dom: Document): Promise<Document> {
    // Remove the [edit] anchors from wikipedia pages
    dom.querySelectorAll('.mw-editsection').forEach((e) => e.remove())

    // Remove footnotes
    dom.querySelectorAll('sup[class="reference"]').forEach((e) => e.remove())

    // Remove the sidebar
    dom.querySelector('.infobox')?.remove()
    return Promise.resolve(dom)
  }
}
@@ -0,0 +1,59 @@
import axios from 'axios'
import { parseHTML } from 'linkedom'
import { ContentHandler, PreHandleResult } from '../content-handler'

export class WiredHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Wired'
  }

  // We check whether this is a paywalled article; paywalled articles mark
  // their body paragraphs with the `paywall` class.
  isPaywalledContent(document: Document): boolean {
    return document.getElementsByClassName('paywall').length > 0
  }

  removeNonArticleNodes(document: Document): Document {
    const genericCallouts = Array.from(
      document.querySelectorAll('[data-testid="GenericCallout"]')
    )
    const ads = Array.from(document.querySelectorAll('.ad__slot')).map(
      (it) => it.parentElement
    )
    const mostPopularArticles = Array.from(
      document.querySelectorAll('[data-most-popular-id]')
    )

    ;[...genericCallouts, ...ads, ...mostPopularArticles].forEach((it) =>
      it?.remove()
    )

    return document
  }

  shouldPreHandle(url: string): boolean {
    const u = new URL(url)
    return u.hostname.endsWith('wired.com')
  }

  async preHandle(url: string): Promise<PreHandleResult> {
    const response = await axios.get(url)
    const data = response.data as string
    const dom = parseHTML(data).document

    if (!this.isPaywalledContent(dom)) {
      // This is just to ensure that the currently working articles don't break.
      // Looking further into this, they might all have paywalls?
      return {}
    }

    const cleanedArticleDom = this.removeNonArticleNodes(dom)

    return {
      content: cleanedArticleDom.body.outerHTML,
      title: dom.title,
      dom: cleanedArticleDom,
    }
  }
}
@@ -0,0 +1,97 @@
import { ContentHandler, PreHandleResult } from '../content-handler'
import axios from 'axios'
import _ from 'underscore'

// the dot in youtu.be is escaped so it matches only the literal hostname
const YOUTUBE_URL_MATCH =
  /^((?:https?:)?\/\/)?((?:www|m)\.)?((?:youtube\.com|youtu\.be))(\/(?:[\w-]+\?v=|embed\/|v\/)?)([\w-]+)(\S+)?$/

export const getYoutubeVideoId = (url: string) => {
  const u = new URL(url)
  const videoId = u.searchParams.get('v')
  if (!videoId) {
    const match = url.toString().match(YOUTUBE_URL_MATCH)
    if (match === null || match.length < 6 || !match[5]) {
      return undefined
    }
    return match[5]
  }
  return videoId
}

export const getYoutubePlaylistId = (url: string) => {
  const u = new URL(url)
  return u.searchParams.get('list')
}

export const escapeTitle = (title: string) => {
  return _.escape(title)
}

export class YoutubeHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Youtube'
  }

  shouldPreHandle(url: string): boolean {
    return YOUTUBE_URL_MATCH.test(url.toString())
  }

  async preHandle(url: string): Promise<PreHandleResult> {
    const BaseUrl = 'https://www.youtube.com'
    const embedBaseUrl = 'https://www.youtube.com/embed'
    let urlToEncode: string
    let src: string
    const playlistId = getYoutubePlaylistId(url)
    if (playlistId) {
      urlToEncode = `${BaseUrl}/playlist?list=${playlistId}`
      src = `${embedBaseUrl}/videoseries?list=${playlistId}`
    } else {
      const videoId = getYoutubeVideoId(url)
      if (!videoId) {
        return {}
      }
      urlToEncode = `${BaseUrl}/watch?v=${videoId}`
      src = `${embedBaseUrl}/${videoId}`
    }

    const oembedUrl =
      `https://www.youtube.com/oembed?format=json&url=` +
      encodeURIComponent(urlToEncode)
    const oembed = (await axios.get(oembedUrl.toString())).data as {
      title: string
      width: number
      height: number
      thumbnail_url: string
      author_name: string
      author_url: string
    }
    // escape html entities in title
    const title = oembed.title
    const escapedTitle = escapeTitle(title)
    const ratio = oembed.width / oembed.height
    const thumbnail = oembed.thumbnail_url
    const height = 350
    const width = height * ratio
    const authorName = _.escape(oembed.author_name)
    const content = `
    <html>
      <head><title>${escapedTitle}</title>
        <meta property="og:image" content="${thumbnail}" />
        <meta property="og:image:secure_url" content="${thumbnail}" />
        <meta property="og:title" content="${escapedTitle}" />
        <meta property="og:description" content="" />
        <meta property="og:article:author" content="${authorName}" />
        <meta property="og:site_name" content="YouTube" />
        <meta property="og:type" content="video" />
      </head>
      <body>
        <iframe width="${width}" height="${height}" src="${src}" title="${escapedTitle}" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
        <p><a href="${url}" target="_blank">${escapedTitle}</a></p>
        <p itemscope="" itemprop="author" itemtype="http://schema.org/Person">By <a href="${oembed.author_url}" target="_blank">${authorName}</a></p>
      </body>
    </html>`

    return { content, title }
  }
}
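The iframe dimensions preserve the oEmbed aspect ratio at a fixed 350px height; a worked example with illustrative numbers:

// a 480x270 (16:9) oEmbed answer scales to roughly 622x350
const ratio = 480 / 270 // ≈ 1.78
const height = 350
const width = height * ratio // ≈ 622.2, interpolated into the iframe markup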
@@ -0,0 +1,117 @@
import { ContentHandler } from '../content-handler'

export class ZhihuHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'zhihu'
  }

  parseQuestion(element: Element) {
    const newQuestion = element.ownerDocument.createElement('div')
    const question = element.querySelector(`.QuestionHeader-main`)
    if (question) {
      const votes = element
        .querySelector(`div[itemprop='upvoteCount']`)
        ?.getAttribute('data-value')

      if (votes) {
        // '问题' is Chinese for 'Question'
        newQuestion.innerHTML = `<h2>问题: ${votes} vote${
          votes === '1' ? '' : 's'
        }</h2>${question.innerHTML}`
      }
    }
    return newQuestion
  }

  parseComments(element: Element) {
    const dom = element.ownerDocument
    const newComments = dom.createElement('div')

    // comments
    const commentsDiv = element.querySelector(`.comments`)
    if (commentsDiv) {
      const comments = commentsDiv.querySelectorAll(`.comment`)
      if (comments.length > 0) {
        newComments.innerHTML = `<h3>Comments</h3>`

        comments.forEach((comment) => {
          const author = comment.querySelector(`.comment-user`)
          const text = comment.querySelector(`.comment-copy`)?.textContent
          const authorHref = author?.getAttribute('href')
          const date = comment.querySelector(`.relativetime-clean`)?.textContent
          if (author && text && authorHref && date) {
            const newComment = dom.createElement('p')
            newComment.innerHTML = `<a href="${authorHref}"><b>${author.innerHTML}</b></a>: ${text} - ${date}`
            newComments.appendChild(newComment)
          }
        })
      }
    }

    return newComments
  }

  parseAuthors(element: Element) {
    const dom = element.ownerDocument
    const newAuthors = dom.createElement('div')

    const authors = element.querySelectorAll(`.post-signature`)
    authors.forEach((author) => {
      const isOwner = author.classList.contains('owner')
      const name = author.querySelector(`.user-details a`)?.textContent
      const link = author.querySelector(`.user-details a`)?.getAttribute('href')
      const reputation = author.querySelector(`.reputation-score`)?.textContent
      const badges = Array.from(
        author.querySelectorAll(`span[title*='badges']`)
      )
        .map((badge) => badge.getAttribute('title'))
        .join(', ')
      const date = author.querySelector(`.user-action-time`)?.textContent
      if (name && link && reputation && date) {
        const newAuthor = dom.createElement('p')
        newAuthor.innerHTML = `<a href="${link}"><b>${name}</b></a> - ${reputation} reputation - ${
          badges || 'no badge'
        } - ${date}`
        if (isOwner) {
          const author = dom.createElement('span')
          author.setAttribute('rel', 'author')
          author.innerHTML = name
          newAuthor.appendChild(author)
        }
        newAuthors.appendChild(newAuthor)
      }
    })

    return newAuthors
  }

  shouldPreParse(url: string, dom: Document): boolean {
    return new URL(url).hostname.endsWith('zhihu.com')
  }

  async preParse(url: string, dom: Document): Promise<Document> {
    const mainEntity = dom.querySelector(`div[itemprop='mainEntity']`)
    if (mainEntity) {
      const newMainEntity = dom.createElement('div')
      const question = mainEntity.querySelector('.QuestionHeader')
      if (question) {
        question.className = '_omnivore_zhihu_question'
        newMainEntity.appendChild(question)
      }

      const answers = mainEntity.querySelectorAll('.ContentItem.AnswerItem')
      answers.forEach((answer) => {
        answer
          .querySelector('.AuthorInfo')
          ?.setAttribute('class', '_omnivore_zhihu_author')

        answer.className = '_omnivore_zhihu_answer'
        newMainEntity.appendChild(answer)
      })

      dom.body.replaceChildren(newMainEntity)
    }

    return Promise.resolve(dom)
  }
}