Files
dockerfile/examples/omnivore/api/content-handler/src/websites/wikipedia-handler.ts

25 lines
706 B
TypeScript

import { ContentHandler } from '../content-handler'
/**
 * Content handler that strips Wikipedia-specific page chrome from a document
 * before it is parsed: section "[edit]" links, footnote markers, and the
 * first infobox.
 */
export class WikipediaHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'wikipedia'
  }

  /**
   * Reports whether `url` is hosted on wikipedia.org or one of its subdomains
   * (for example `en.wikipedia.org`).
   *
   * @param url - Absolute URL of the page being processed.
   * @param dom - Not consulted by this handler.
   * @returns `true` when the URL's hostname is `wikipedia.org` or ends with
   *   `.wikipedia.org`; `false` otherwise.
   * @throws TypeError if `url` cannot be parsed as an absolute URL.
   */
  shouldPreParse(url: string, dom: Document): boolean {
    const { hostname } = new URL(url)
    // Match on a domain-label boundary. A bare `endsWith('wikipedia.org')`
    // would also accept unrelated hosts such as `notwikipedia.org`.
    return hostname === 'wikipedia.org' || hostname.endsWith('.wikipedia.org')
  }

  /**
   * Removes Wikipedia page furniture from `dom` in place.
   *
   * @param url - URL of the page; not consulted by this method.
   * @param dom - Document to clean up. It is mutated and then returned.
   * @returns A promise resolving to the same `dom` instance that was passed in.
   */
  async preParse(url: string, dom: Document): Promise<Document> {
    // Remove the "[edit]" anchors next to section headings.
    dom.querySelectorAll('.mw-editsection').forEach((e) => e.remove())
    // Remove footnote markers. The attribute selector matches only <sup>
    // elements whose class attribute is exactly "reference".
    dom.querySelectorAll('sup[class="reference"]').forEach((e) => e.remove())
    // Remove the infobox. Only the first `.infobox` element is removed;
    // any further ones are left in place.
    dom.querySelector('.infobox')?.remove()
    return Promise.resolve(dom)
  }
}