import { htmlToText } from 'html-to-text'
import { parseHTML } from 'linkedom'
import {
  SentenceTokenizer,
  SentenceTokenizerNew,
  WordPunctTokenizer,
} from 'natural'

// this code needs to be kept in sync with the
// frontend code in: useReadingProgressAnchor

export interface HtmlInput {
  title?: string
  content: string
  options: SSMLOptions
}

export interface Utterance {
  idx: string
  text: string
  wordOffset: number
  wordCount: number
  voice: string
}

export interface SpeechFile {
  wordCount: number
  language: string
  defaultVoice: string
  utterances: Utterance[]
}

export type SSMLItem = {
  open: string
  close: string
  textItems: string[]
  idx: number
  voice?: string
}

export type SSMLOptions = {
  primaryVoice?: string
  secondaryVoice?: string
  rate?: string
  language?: string
}

const DEFAULT_LANGUAGE = 'en-US'
const DEFAULT_VOICE = 'en-US-JennyNeural'
const DEFAULT_SECONDARY_VOICE = 'en-US-GuyNeural'
const DEFAULT_RATE = '1.1'

const ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES = [
  'omnivore-highlight-id',
  'data-twitter-tweet-id',
  'data-instagram-id',
]

function ssmlTagsForTopLevelElement() {
  return {
    opening: `<break time="250ms"/>`,
  }
}

const TOP_LEVEL_TAGS = [
  'P',
  'BLOCKQUOTE',
  'H1',
  'H2',
  'H3',
  'H4',
  'H5',
  'H6',
  'LI',
]

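// Walks the DOM depth-first from pageNode, skipping non-element nodes and any
// element carrying a blocked (dynamically injected) attribute, and tags every
// remaining element with a data-omnivore-anchor-idx attribute so utterances
// can later be anchored back to reading-progress positions in the frontend.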
function parseDomTree(pageNode: Element) {
  if (!pageNode || pageNode.childNodes.length == 0) {
    console.log('no child nodes found')
    return []
  }

  const nodesToVisitStack = [pageNode]
  const visitedNodeList = []

  while (nodesToVisitStack.length > 0) {
    const currentNode = nodesToVisitStack.pop()
    if (
      currentNode?.nodeType !== 1 /* Node.ELEMENT_NODE */ ||
      // Avoid counting dynamic elements as anchor-allowed elements
      ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES.some((attrib) =>
        currentNode.hasAttribute(attrib)
      )
    ) {
      continue
    }

    visitedNodeList.push(currentNode)
    ;[].slice
      .call(currentNode.childNodes)
      .reverse()
      .forEach(function (node) {
        nodesToVisitStack.push(node)
      })
  }

  visitedNodeList.shift()
  visitedNodeList.forEach((node, index) => {
    // We start at index 3 because the frontend starts two nodes above us,
    // on the #readability-page-1 element that wraps the entire content.
    node.setAttribute('data-omnivore-anchor-idx', (index + 3).toString())
  })
  return visitedNodeList
}

function emit(textItems: string[], text: string) {
  textItems.push(text)
}

function cleanTextNode(textNode: ChildNode): string {
  return stripEmojis(textNode.textContent ?? '')
}

function emitTextNode(
  textItems: string[],
  cleanedText: string,
  textNode: ChildNode
) {
  const ssmlElement =
    textNode.parentNode?.nodeName === 'B' ? 'emphasis' : undefined
  if (!cleanedText) {
    return
  }

  if (ssmlElement) {
    emit(textItems, `<${ssmlElement}>`)
  }
  emit(textItems, `${cleanedText.replace(/\s+/g, ' ')}`)
  if (ssmlElement) {
    emit(textItems, `</${ssmlElement}>`)
  }
}

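// Converts an element's children into SSML text items: emits a
// <bookmark mark="idx"/> marker before each significant text node, wraps text
// inside <b> in <emphasis>, skips non-readable tags, and recurses into child
// elements. Returns the anchor index of the last element visited in the
// subtree, which callers use to skip nodes that were already emitted.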
function emitElement(
  textItems: string[],
  element: Element,
  isTopLevel: boolean
) {
  const SKIP_TAGS = [
    'SCRIPT',
    'STYLE',
    'IMG',
    'FIGURE',
    'FIGCAPTION',
    'IFRAME',
    'CODE',
  ]

  const topLevelTags = ssmlTagsForTopLevelElement()
  const idx = element.getAttribute('data-omnivore-anchor-idx')
  let maxVisitedIdx = Number(idx)

  if (isTopLevel) {
    emit(textItems, topLevelTags.opening)
  }

  for (const child of Array.from(element.childNodes)) {
    if (SKIP_TAGS.indexOf(child.nodeName) >= 0) {
      continue
    }

    if (
      child.nodeType == 3 /* Node.TEXT_NODE */ &&
      (child.textContent?.length ?? 0) > 0
    ) {
      const cleanedText = cleanTextNode(child)
      if (idx && cleanedText.length > 1) {
        // Make sure it's more than just a space
        emit(textItems, `<bookmark mark="${idx}"/>`)
      }
      emitTextNode(textItems, cleanedText, child)
    }
    if (child.nodeType == 1 /* Node.ELEMENT_NODE */) {
      maxVisitedIdx = emitElement(textItems, child as HTMLElement, false)
      if (child.nodeName === 'LI') {
        // add a new line after each list item
        emit(textItems, '\n')
      }
    }
  }

  return Number(maxVisitedIdx)
}

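// startSsml/endSsml produce the SSML envelope (speak/voice/prosody) around an
// item's text; blockquotes are read in the secondary voice.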
export const startSsml = (options: SSMLOptions, element?: Element): string => {
  const voice =
    element?.nodeName === 'BLOCKQUOTE'
      ? options.secondaryVoice ?? DEFAULT_SECONDARY_VOICE
      : options.primaryVoice ?? DEFAULT_VOICE
  return `<speak xmlns="http://www.w3.org/2001/10/synthesis" version="1.0" xml:lang="${
    options.language || DEFAULT_LANGUAGE
  }"><voice name="${voice}"><prosody rate="${options.rate || DEFAULT_RATE}">`
}

export const endSsml = (): string => {
  return `</prosody></voice></speak>`
}

const hasSignificantText = (node: ChildNode): boolean => {
  let text = ''
  for (const child of Array.from(node.childNodes)) {
    if (child.nodeType === 3 /* Node.TEXT_NODE */) {
      text += child.textContent
    }
  }
  return text.trim().length > 0
}

export const ssmlItemText = (item: SSMLItem): string => {
  return [item.open, ...item.textItems, item.close].join('')
}

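// Builds one SSMLItem per readable top-level node of a readability-parsed
// document; each item carries its own SSML envelope and anchor index.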
export const htmlToSsmlItems = (
  html: string,
  options: SSMLOptions
): SSMLItem[] => {
  console.log('creating ssml with options', options)

  const dom = parseHTML(html)
  const body = dom.document.querySelector('#readability-page-1')
  if (!body) {
    throw new Error('Unable to parse HTML document')
  }

  const parsedNodes = parseDomTree(body)
  if (parsedNodes.length < 1) {
    throw new Error('No HTML nodes found')
  }

  const items: SSMLItem[] = []
  for (let i = 3; i < parsedNodes.length + 3; i++) {
    const textItems: string[] = []
    const node = parsedNodes[i - 3]

    if (TOP_LEVEL_TAGS.includes(node.nodeName) || hasSignificantText(node)) {
      const idx = i
      i = emitElement(textItems, node, true)
      items.push({
        open: startSsml(options, node),
        close: endSsml(),
        textItems: textItems,
        idx,
        voice:
          node.nodeName === 'BLOCKQUOTE' ? options.secondaryVoice : undefined,
      })
    }
  }

  return items
}

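// Removes emoji code points (the lookahead keeps digits, '#' and '*', which
// also carry the Emoji property) and collapses runs of whitespace.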
export const stripEmojis = (text: string): string => {
  const emojiRegex =
    /(?![*#0-9]+)[\p{Emoji}\p{Emoji_Modifier}\p{Emoji_Component}\p{Emoji_Modifier_Base}\p{Emoji_Presentation}]/gu
  return text.replace(emojiRegex, '').replace(/\s+/g, ' ')
}

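// Splits an item's text into utterances of at most MAX_CHARS characters,
// using sentence tokenization so sentences and words are kept intact where
// possible. wordOffset tracks the running word position for progress syncing.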
const textToUtterances = ({
  wordTokenizer,
  idx,
  textItems,
  wordOffset,
  voice,
  isHtml = true,
}: {
  wordTokenizer: WordPunctTokenizer
  idx: string
  textItems: string[]
  wordOffset: number
  voice: string
  isHtml?: boolean
}): Utterance[] => {
  let text = textItems.join('')
  if (!isHtml) {
    // plain text (e.g. the title): return it as a single utterance
    return [
      {
        idx,
        text,
        wordOffset,
        wordCount: wordTokenizer.tokenize(text).length,
        voice,
      },
    ]
  }

  const utterances: Utterance[] = []
  try {
    text = htmlToText(text, { wordwrap: false })
  } catch (err) {
    console.error('Unable to convert HTML to text')
    text = parseHTML(text).document.documentElement.textContent ?? text
    console.info('Converted HTML to text')
  }
  const MAX_CHARS = 256
  let sentences: string[] = []
  try {
    // use the new sentence tokenizer
    const sentenceTokenizer = new SentenceTokenizerNew()
    sentences = sentenceTokenizer.tokenize(text)
  } catch (err) {
    console.debug('Unable to tokenize sentences')
    // fall back to the old sentence tokenizer
    const sentenceTokenizer = new SentenceTokenizer()
    sentences = sentenceTokenizer.tokenize(text)
  }
  let currentText = ''
  // Split the text into utterances of at most 256 chars, using the NLP
  // library to detect sentence boundaries so words and sentences are not
  // broken apart.
  sentences.forEach((sentence, i) => {
    if (i < sentences.length - 1) {
      // add a space to the end of the sentence
      sentence += ' '
    }
    const nextText = currentText + sentence
    if (nextText.length > MAX_CHARS) {
      if (currentText.length > 0) {
        const wordCount = wordTokenizer.tokenize(currentText).length
        utterances.push({
          idx,
          text: currentText,
          wordOffset,
          wordCount,
          voice,
        })
        wordOffset += wordCount
        currentText = sentence
      } else {
        // a single sentence longer than MAX_CHARS: emit it as its own utterance
        const wordCount = wordTokenizer.tokenize(sentence).length
        utterances.push({
          idx,
          text: sentence,
          wordOffset,
          wordCount,
          voice,
        })
        wordOffset += wordCount
      }
    } else {
      currentText = nextText
    }
    if (i === sentences.length - 1 && currentText.length > 0) {
      utterances.push({
        idx,
        text: currentText,
        wordOffset,
        wordCount: wordTokenizer.tokenize(currentText).length,
        voice,
      })
    }
  })

  return utterances
}

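// Converts readability-parsed HTML (plus an optional title) into a SpeechFile:
// a flat list of utterances with word offsets, anchored to the DOM via
// data-omnivore-anchor-idx so playback position can drive reading progress.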
export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
  const { title, content, options } = htmlInput
  console.log('creating speech file with options:', options)
  const language = options.language || DEFAULT_LANGUAGE
  const defaultVoice = options.primaryVoice || DEFAULT_VOICE

  const dom = parseHTML(content)
  const body = dom.document.querySelector('#readability-page-1')
  if (!body) {
    console.log('No HTML body found')
    return {
      wordCount: 0,
      language,
      defaultVoice,
      utterances: [],
    }
  }

  const parsedNodes = parseDomTree(body)
  if (parsedNodes.length < 1) {
    console.log('No HTML nodes found')
    return {
      wordCount: 0,
      language,
      defaultVoice,
      utterances: [],
    }
  }

  const wordTokenizer = new WordPunctTokenizer()
  const utterances: Utterance[] = []
  let wordOffset = 0
  if (title) {
    // the first utterance is the title
    const titleUtterance = textToUtterances({
      wordTokenizer,
      idx: '',
      textItems: [stripEmojis(title)], // the title could contain emoji
      wordOffset,
      isHtml: false,
      voice: defaultVoice,
    })[0]
    utterances.push(titleUtterance)
    wordOffset += titleUtterance.wordCount
  }

  // start at 3 to skip the #readability-content and #readability-page-1 elements
  for (let i = 3; i < parsedNodes.length + 3; i++) {
    const textItems: string[] = []
    const node = parsedNodes[i - 3]

    if (TOP_LEVEL_TAGS.includes(node.nodeName) || hasSignificantText(node)) {
      // use the paragraph as the anchor
      const idx = i.toString()
      i = emitElement(textItems, node, true)
      const newUtterances = textToUtterances({
        wordTokenizer,
        idx,
        textItems,
        wordOffset,
        voice:
          node.nodeName === 'BLOCKQUOTE'
            ? options.secondaryVoice || defaultVoice
            : defaultVoice,
      })
      const wordCount = newUtterances.reduce((acc, u) => acc + u.wordCount, 0)
      wordCount > 0 && utterances.push(...newUtterances)
      wordOffset += wordCount
    }
  }

  return {
    wordCount: wordOffset,
    language,
    defaultVoice,
    utterances,
  }
}