import { htmlToText } from 'html-to-text'
import { parseHTML } from 'linkedom'
import {
  SentenceTokenizer,
  SentenceTokenizerNew,
  WordPunctTokenizer,
} from 'natural'

// this code needs to be kept in sync with the
// frontend code in: useReadingProgressAnchor

/** Input accepted by {@link htmlToSpeechFile}. */
export interface HtmlInput {
  title?: string
  content: string
  options: SSMLOptions
}

/** One chunk of speakable text, anchored to an element index. */
export interface Utterance {
  idx: string
  text: string
  wordOffset: number
  wordCount: number
  voice: string
}

/** Result of {@link htmlToSpeechFile}. */
export interface SpeechFile {
  wordCount: number
  language: string
  defaultVoice: string
  utterances: Utterance[]
}

/** One SSML document fragment produced by {@link htmlToSsmlItems}. */
export type SSMLItem = {
  open: string
  close: string
  textItems: string[]
  idx: number
  voice?: string
}

export type SSMLOptions = {
  primaryVoice?: string
  secondaryVoice?: string
  rate?: string
  language?: string
}

const DEFAULT_LANGUAGE = 'en-US'
const DEFAULT_VOICE = 'en-US-JennyNeural'
const DEFAULT_SECONDARY_VOICE = 'en-US-GuyNeural'
const DEFAULT_RATE = '1.1'

const ELEMENT_NODE = 1 /* Node.ELEMENT_NODE */
const TEXT_NODE = 3 /* Node.TEXT_NODE */

// Anchor indexes start at 3, because the frontend starts two nodes above us
// on the #readability-page-1 element that wraps the entire content.
const ANCHOR_INDEX_OFFSET = 3
const ANCHOR_INDEX_ATTRIBUTE = 'data-omnivore-anchor-idx'

// Upper bound on the length of a single utterance's text.
const MAX_UTTERANCE_CHARS = 256

const ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES = [
  'omnivore-highlight-id',
  'data-twitter-tweet-id',
  'data-instagram-id',
]

// Child nodes with these names are never emitted as speech.
const SKIP_TAGS = [
  'SCRIPT',
  'STYLE',
  'IMG',
  'FIGURE',
  'FIGCAPTION',
  'IFRAME',
  'CODE',
]

/**
 * Markup emitted before the content of a top-level element.
 *
 * NOTE(review): `opening` is an empty string in this revision, so top-level
 * elements currently get no wrapper markup - confirm whether a tag was
 * intended here.
 */
function ssmlTagsForTopLevelElement() {
  return {
    opening: ``,
  }
}

const TOP_LEVEL_TAGS = [
  'P',
  'BLOCKQUOTE',
  'H1',
  'H2',
  'H3',
  'H4',
  'H5',
  'H6',
  'LI',
]

/**
 * Escapes the characters that are significant in XML so that article text and
 * caller-supplied option values cannot break the generated markup.
 */
function escapeXml(text: string): string {
  return text
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
}

/**
 * Walks the tree below `pageNode` depth-first in document order, collecting
 * every element that does not carry one of the blocked attributes, and stamps
 * each collected element with its anchor index attribute.
 *
 * @param pageNode root element; it is visited but not included in the result
 * @returns the collected descendants in visiting order (empty when the root
 *   is missing or has no child nodes)
 */
function parseDomTree(pageNode: Element): Element[] {
  if (!pageNode || pageNode.childNodes.length === 0) {
    console.log('no child nodes found')
    return []
  }

  const nodesToVisitStack: Node[] = [pageNode]
  const visitedNodeList: Element[] = []

  while (nodesToVisitStack.length > 0) {
    const currentNode = nodesToVisitStack.pop()
    if (!currentNode || currentNode.nodeType !== ELEMENT_NODE) {
      continue
    }

    const element = currentNode as Element
    // Avoiding dynamic elements from being counted as anchor-allowed elements
    const isBlocked = ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES.some((attrib) =>
      element.hasAttribute(attrib)
    )
    if (isBlocked) {
      continue
    }

    visitedNodeList.push(element)
    // Push children in reverse so the first child is popped (visited) first.
    for (const child of Array.from(element.childNodes).reverse()) {
      nodesToVisitStack.push(child)
    }
  }

  // Drop the root element itself; only its descendants are anchors.
  visitedNodeList.shift()
  visitedNodeList.forEach((node, index) => {
    node.setAttribute(
      ANCHOR_INDEX_ATTRIBUTE,
      (index + ANCHOR_INDEX_OFFSET).toString()
    )
  })
  return visitedNodeList
}

/** Appends one piece of output to `textItems`. */
function emit(textItems: string[], text: string) {
  textItems.push(text)
}

/** Returns the text of `textNode` with emojis removed and whitespace collapsed. */
function cleanTextNode(textNode: ChildNode): string {
  return stripEmojis(textNode.textContent ?? '')
}

/**
 * Emits the cleaned text of a text node, wrapped in `<emphasis>` when the
 * node's parent is a `<b>` element. Nothing is emitted for empty text.
 */
function emitTextNode(
  textItems: string[],
  cleanedText: string,
  textNode: ChildNode
) {
  if (!cleanedText) {
    return
  }

  const ssmlElement =
    textNode.parentNode?.nodeName === 'B' ? 'emphasis' : undefined

  if (ssmlElement) {
    emit(textItems, `<${ssmlElement}>`)
  }
  // Escape the text so '&' and '<' in the article cannot be parsed as markup.
  emit(textItems, escapeXml(cleanedText.replace(/\s+/g, ' ')))
  if (ssmlElement) {
    // Close the tag opened above so the emitted markup stays well-formed.
    emit(textItems, `</${ssmlElement}>`)
  }
}

/**
 * Recursively emits the speakable content of `element` into `textItems`.
 *
 * @param textItems output buffer that receives markup and text pieces
 * @param element element whose child nodes are emitted
 * @param isTopLevel when true, the top-level opening markup is emitted first
 * @returns the highest anchor index of `element` and of any descendant element
 *   that was emitted (0 when none of them carries an anchor index)
 */
function emitElement(
  textItems: string[],
  element: Element,
  isTopLevel: boolean
): number {
  const topLevelTags = ssmlTagsForTopLevelElement()
  const idx = element.getAttribute(ANCHOR_INDEX_ATTRIBUTE)
  const ownIdx = Number(idx)
  let maxVisitedIdx = Number.isFinite(ownIdx) ? ownIdx : 0

  if (isTopLevel) {
    emit(textItems, topLevelTags.opening)
  }

  for (const child of Array.from(element.childNodes)) {
    if (SKIP_TAGS.includes(child.nodeName)) {
      continue
    }

    if (
      child.nodeType === TEXT_NODE &&
      (child.textContent?.length ?? 0) > 0
    ) {
      const cleanedText = cleanTextNode(child)
      if (idx && cleanedText.length > 1) {
        // Make sure it's more than just a space
        // NOTE(review): bookmark element follows the Azure SSML format -
        // confirm this matches what the speech consumer expects.
        emit(textItems, `<bookmark mark="${escapeXml(idx)}"/>`)
      }
      emitTextNode(textItems, cleanedText, child)
    }

    if (child.nodeType === ELEMENT_NODE) {
      // Descendants without an anchor index report 0; never let that lower
      // the running maximum, because callers use it to advance their cursor.
      maxVisitedIdx = Math.max(
        maxVisitedIdx,
        emitElement(textItems, child as Element, false)
      )
      if (child.nodeName === 'LI') {
        // add a new line after each list item
        emit(textItems, '\n')
      }
    }
  }

  return maxVisitedIdx
}

/**
 * Builds the opening SSML markup for one item: `speak`, `voice` and `prosody`.
 * Blockquotes are read with the secondary voice, everything else with the
 * primary voice.
 *
 * NOTE(review): the exact attribute set follows the Azure SSML document
 * structure - confirm against the speech service actually in use.
 */
export const startSsml = (options: SSMLOptions, element?: Element): string => {
  const voice =
    element?.nodeName === 'BLOCKQUOTE'
      ? options.secondaryVoice ?? DEFAULT_SECONDARY_VOICE
      : options.primaryVoice ?? DEFAULT_VOICE
  const language = options.language || DEFAULT_LANGUAGE
  const rate = options.rate || DEFAULT_RATE

  return (
    `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" ` +
    `xml:lang="${escapeXml(language)}">` +
    `<voice name="${escapeXml(voice)}">` +
    `<prosody rate="${escapeXml(rate)}">`
  )
}

/** Closes the elements opened by {@link startSsml}, innermost first. */
export const endSsml = (): string => {
  return `</prosody></voice></speak>`
}

/** True when the direct text-node children of `node` contain non-whitespace. */
const hasSignificantText = (node: ChildNode): boolean => {
  let text = ''
  for (const child of Array.from(node.childNodes)) {
    if (child.nodeType === TEXT_NODE) {
      text += child.textContent
    }
  }
  return text.trim().length > 0
}

/** Joins an item's opening markup, text pieces and closing markup. */
export const ssmlItemText = (item: SSMLItem): string => {
  return [item.open, ...item.textItems, item.close].join('')
}

type SpeakableNode = {
  idx: number
  node: Element
  textItems: string[]
}

/**
 * Selects the nodes that become their own speech item (top-level tags, or
 * any element with direct text) and emits each one's content. Descendants
 * that were emitted as part of an item are skipped.
 */
function collectSpeakableNodes(parsedNodes: Element[]): SpeakableNode[] {
  const speakable: SpeakableNode[] = []

  // start at 3 to skip the #readability-content and #readability-page-1 elements
  for (
    let i = ANCHOR_INDEX_OFFSET;
    i < parsedNodes.length + ANCHOR_INDEX_OFFSET;
    i++
  ) {
    const node = parsedNodes[i - ANCHOR_INDEX_OFFSET]
    if (!node) {
      continue
    }
    if (!TOP_LEVEL_TAGS.includes(node.nodeName) && !hasSignificantText(node)) {
      continue
    }

    // use paragraph as anchor
    const textItems: string[] = []
    const lastVisitedIdx = emitElement(textItems, node, true)
    speakable.push({ idx: i, node, textItems })

    // Jump past the descendants that were just emitted; never move backwards.
    i = Math.max(i, lastVisitedIdx)
  }

  return speakable
}

/**
 * Converts article HTML into a list of SSML items, one per speakable node.
 *
 * @param html document containing a `#readability-page-1` element
 * @param options voices, rate and language used for the SSML envelope
 * @throws Error when the wrapper element is missing or contains no elements
 */
export const htmlToSsmlItems = (
  html: string,
  options: SSMLOptions
): SSMLItem[] => {
  console.log('creating ssml with options', options)

  const dom = parseHTML(html)
  const body = dom.document.querySelector('#readability-page-1')
  if (!body) {
    throw new Error('Unable to parse HTML document')
  }

  const parsedNodes = parseDomTree(body)
  if (parsedNodes.length < 1) {
    throw new Error('No HTML nodes found')
  }

  return collectSpeakableNodes(parsedNodes).map(
    ({ idx, node, textItems }): SSMLItem => ({
      open: startSsml(options, node),
      close: endSsml(),
      textItems,
      idx,
      voice: node.nodeName === 'BLOCKQUOTE' ? options.secondaryVoice : undefined,
    })
  )
}

/** Removes emoji characters from `text` and collapses runs of whitespace. */
export const stripEmojis = (text: string): string => {
  const emojiRegex =
    /(?![*#0-9]+)[\p{Emoji}\p{Emoji_Modifier}\p{Emoji_Component}\p{Emoji_Modifier_Base}\p{Emoji_Presentation}]/gu
  return text.replace(emojiRegex, '').replace(/\s+/g, ' ')
}

/**
 * Splits the emitted pieces of one node into utterances of at most
 * `MAX_UTTERANCE_CHARS` characters, breaking only between sentences. A single
 * sentence longer than the limit becomes its own utterance.
 *
 * When `isHtml` is false (used for the title) the joined text is returned as
 * a single utterance without any conversion or splitting.
 */
const textToUtterances = ({
  wordTokenizer,
  idx,
  textItems,
  wordOffset,
  voice,
  isHtml = true,
}: {
  wordTokenizer: WordPunctTokenizer
  idx: string
  textItems: string[]
  wordOffset: number
  voice: string
  isHtml?: boolean
}): Utterance[] => {
  let text = textItems.join('')
  if (!isHtml) {
    // for title
    return [
      {
        idx,
        text,
        wordOffset,
        wordCount: wordTokenizer.tokenize(text).length,
        voice,
      },
    ]
  }

  try {
    text = htmlToText(text, { wordwrap: false })
  } catch (err) {
    console.error('Unable to convert HTML to text', err)
    text = parseHTML(text).document.documentElement.textContent ?? text
    console.info('Converted HTML to text')
  }

  let sentences: string[] = []
  try {
    // use new sentence tokenizer
    sentences = new SentenceTokenizerNew().tokenize(text)
  } catch (err) {
    console.debug('Unable to tokenize sentences', err)
    // fallback to old sentence tokenizer
    sentences = new SentenceTokenizer().tokenize(text)
  }

  const utterances: Utterance[] = []
  let nextWordOffset = wordOffset

  // Records one utterance and advances the running word offset past it.
  const flush = (utteranceText: string) => {
    const wordCount = wordTokenizer.tokenize(utteranceText).length
    utterances.push({
      idx,
      text: utteranceText,
      wordOffset: nextWordOffset,
      wordCount,
      voice,
    })
    nextWordOffset += wordCount
  }

  // split text to max 256 chars per utterance and
  // use nlp lib to detect sentences and
  // avoid splitting words and sentences
  let currentText = ''
  sentences.forEach((sentence, i) => {
    const isLast = i === sentences.length - 1
    // add space to the end of every sentence except the last one
    const piece = isLast ? sentence : sentence + ' '
    const nextText = currentText + piece

    if (nextText.length <= MAX_UTTERANCE_CHARS) {
      currentText = nextText
    } else if (currentText.length > 0) {
      flush(currentText)
      currentText = piece
    } else {
      // A single sentence over the limit is emitted on its own.
      flush(piece)
    }

    if (isLast && currentText.length > 0) {
      flush(currentText)
    }
  })

  return utterances
}

/** Builds the result returned when there is nothing to speak. */
function emptySpeechFile(language: string, defaultVoice: string): SpeechFile {
  return {
    wordCount: 0,
    language,
    defaultVoice,
    utterances: [],
  }
}

/**
 * Converts article HTML (and an optional title) into a list of utterances
 * with running word offsets.
 *
 * @param htmlInput title, HTML content and voice/language options
 * @returns the utterances plus the total word count; an empty speech file
 *   when the `#readability-page-1` wrapper is missing or has no elements
 */
export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
  const { title, content, options } = htmlInput
  console.log('creating speech file with options:', options)

  const language = options.language || DEFAULT_LANGUAGE
  const defaultVoice = options.primaryVoice || DEFAULT_VOICE

  const dom = parseHTML(content)
  const body = dom.document.querySelector('#readability-page-1')
  if (!body) {
    console.log('No HTML body found')
    return emptySpeechFile(language, defaultVoice)
  }

  const parsedNodes = parseDomTree(body)
  if (parsedNodes.length < 1) {
    console.log('No HTML nodes found')
    return emptySpeechFile(language, defaultVoice)
  }

  const wordTokenizer = new WordPunctTokenizer()
  const utterances: Utterance[] = []
  let wordOffset = 0

  if (title) {
    // first utterances is the title
    const [titleUtterance] = textToUtterances({
      wordTokenizer,
      idx: '',
      textItems: [stripEmojis(title)], // title could have emoji
      wordOffset,
      isHtml: false,
      voice: defaultVoice,
    })
    if (titleUtterance) {
      utterances.push(titleUtterance)
      wordOffset += titleUtterance.wordCount
    }
  }

  for (const { idx, node, textItems } of collectSpeakableNodes(parsedNodes)) {
    const newUtterances = textToUtterances({
      wordTokenizer,
      idx: idx.toString(),
      textItems,
      wordOffset,
      voice:
        node.nodeName === 'BLOCKQUOTE'
          ? options.secondaryVoice || defaultVoice
          : defaultVoice,
    })

    const wordCount = newUtterances.reduce((acc, u) => acc + u.wordCount, 0)
    // Nodes that produced no words are dropped entirely.
    if (wordCount > 0) {
      utterances.push(...newUtterances)
    }
    wordOffset += wordCount
  }

  return {
    wordCount: wordOffset,
    language,
    defaultVoice,
    utterances,
  }
}