dockerfile/examples/omnivore/api/text-to-speech/src/htmlToSsml.ts

import { htmlToText } from 'html-to-text'
import { parseHTML } from 'linkedom'
import {
  SentenceTokenizer,
  SentenceTokenizerNew,
  WordPunctTokenizer,
} from 'natural'
// this code needs to be kept in sync with the
// frontend code in: useReadingProgressAnchor
export interface HtmlInput {
  title?: string
  content: string
  options: SSMLOptions
}
export interface Utterance {
  idx: string
  text: string
  wordOffset: number
  wordCount: number
  voice: string
}
export interface SpeechFile {
  wordCount: number
  language: string
  defaultVoice: string
  utterances: Utterance[]
}
export type SSMLItem = {
  open: string
  close: string
  textItems: string[]
  idx: number
  voice?: string
}
export type SSMLOptions = {
  primaryVoice?: string
  secondaryVoice?: string
  rate?: string
  language?: string
}
const DEFAULT_LANGUAGE = 'en-US'
const DEFAULT_VOICE = 'en-US-JennyNeural'
const DEFAULT_SECONDARY_VOICE = 'en-US-GuyNeural'
const DEFAULT_RATE = '1.1'
const ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES = [
  'omnivore-highlight-id',
  'data-twitter-tweet-id',
  'data-instagram-id',
]
function ssmlTagsForTopLevelElement() {
  return {
    opening: `<break time="250ms"/>`,
  }
}
const TOP_LEVEL_TAGS = [
  'P',
  'BLOCKQUOTE',
  'H1',
  'H2',
  'H3',
  'H4',
  'H5',
  'H6',
  'LI',
]
function parseDomTree(pageNode: Element) {
  if (!pageNode || pageNode.childNodes.length == 0) {
    console.log('no child nodes found')
    return []
  }
  const nodesToVisitStack = [pageNode]
  const visitedNodeList = []
  while (nodesToVisitStack.length > 0) {
    const currentNode = nodesToVisitStack.pop()
    if (
      currentNode?.nodeType !== 1 /* Node.ELEMENT_NODE */ ||
      // Prevent dynamic elements from being counted as anchor-allowed elements
      ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES.some((attrib) =>
        currentNode.hasAttribute(attrib)
      )
    ) {
      continue
    }
    visitedNodeList.push(currentNode)
    ;[].slice
      .call(currentNode.childNodes)
      .reverse()
      .forEach(function (node) {
        nodesToVisitStack.push(node)
      })
  }
  visitedNodeList.shift()
  visitedNodeList.forEach((node, index) => {
    // We start at index 3, because the frontend starts two nodes above us
    // on the #readability-page-1 element that wraps the entire content.
    node.setAttribute('data-omnivore-anchor-idx', (index + 3).toString())
  })
  return visitedNodeList
}
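// Illustrative sketch of how parseDomTree numbers anchors (not part of the
// original file; the sample markup below is an assumption for demonstration):
//
//   const { document } = parseHTML(
//     '<div id="readability-page-1"><p>First</p><p>Second</p></div>'
//   )
//   const nodes = parseDomTree(document.querySelector('#readability-page-1')!)
//   // nodes[0] is the first <p>, now tagged data-omnivore-anchor-idx="3";
//   // nodes[1] is the second <p>, tagged data-omnivore-anchor-idx="4".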
function emit(textItems: string[], text: string) {
  textItems.push(text)
}
function cleanTextNode(textNode: ChildNode): string {
  return stripEmojis(textNode.textContent ?? '')
}
function emitTextNode(
  textItems: string[],
  cleanedText: string,
  textNode: ChildNode
) {
  const ssmlElement =
    textNode.parentNode?.nodeName === 'B' ? 'emphasis' : undefined
  if (!cleanedText) {
    return
  }
  if (ssmlElement) {
    emit(textItems, `<${ssmlElement}>`)
  }
  emit(textItems, `${cleanedText.replace(/\s+/g, ' ')}`)
  if (ssmlElement) {
    emit(textItems, `</${ssmlElement}>`)
  }
}
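// Illustrative sketch (not part of the original file): text nodes whose parent
// is a <b> element are wrapped in SSML <emphasis> tags. The markup below is an
// assumption for demonstration:
//
//   const textItems: string[] = []
//   const { document } = parseHTML('<p>plain <b>bold</b></p>')
//   const boldText = document.querySelector('b')!.firstChild!
//   emitTextNode(textItems, cleanTextNode(boldText), boldText)
//   // textItems is now ['<emphasis>', 'bold', '</emphasis>']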
function emitElement(
  textItems: string[],
  element: Element,
  isTopLevel: boolean
) {
  const SKIP_TAGS = [
    'SCRIPT',
    'STYLE',
    'IMG',
    'FIGURE',
    'FIGCAPTION',
    'IFRAME',
    'CODE',
  ]
  const topLevelTags = ssmlTagsForTopLevelElement()
  const idx = element.getAttribute('data-omnivore-anchor-idx')
  let maxVisitedIdx = Number(idx)
  if (isTopLevel) {
    emit(textItems, topLevelTags.opening)
  }
  for (const child of Array.from(element.childNodes)) {
    if (SKIP_TAGS.indexOf(child.nodeName) >= 0) {
      continue
    }
    if (
      child.nodeType == 3 /* Node.TEXT_NODE */ &&
      (child.textContent?.length ?? 0) > 0
    ) {
      const cleanedText = cleanTextNode(child)
      if (idx && cleanedText.length > 1) {
        // Make sure it's more than just a space
        emit(textItems, `<bookmark mark="${idx}"/>`)
      }
      emitTextNode(textItems, cleanedText, child)
    }
    if (child.nodeType == 1 /* Node.ELEMENT_NODE */) {
      maxVisitedIdx = emitElement(textItems, child as HTMLElement, false)
      if (child.nodeName === 'LI') {
        // add a new line after each list item
        emit(textItems, '\n')
      }
    }
  }
  return Number(maxVisitedIdx)
}
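// Illustrative sketch (not part of the original file): for a top-level element,
// emitElement emits a leading pause, a bookmark carrying the anchor index, and
// the cleaned text. The anchor attribute is set by hand here for demonstration;
// in real use parseDomTree assigns it:
//
//   const textItems: string[] = []
//   const { document } = parseHTML(
//     '<p data-omnivore-anchor-idx="3">Hi <b>there</b></p>'
//   )
//   emitElement(textItems, document.querySelector('p')!, true)
//   // textItems: ['<break time="250ms"/>', '<bookmark mark="3"/>', 'Hi ',
//   //             '<emphasis>', 'there', '</emphasis>']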
export const startSsml = (options: SSMLOptions, element?: Element): string => {
  const voice =
    element?.nodeName === 'BLOCKQUOTE'
      ? options.secondaryVoice ?? DEFAULT_SECONDARY_VOICE
      : options.primaryVoice ?? DEFAULT_VOICE
  return `<speak xmlns="http://www.w3.org/2001/10/synthesis" version="1.0" xml:lang="${
    options.language || DEFAULT_LANGUAGE
  }"><voice name="${voice}"><prosody rate="${options.rate || DEFAULT_RATE}">`
}
export const endSsml = (): string => {
  return `</prosody></voice></speak>`
}
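// Illustrative sketch (not part of the original file): startSsml and endSsml
// produce the SSML envelope each item's text is wrapped in. The option values
// below are assumptions; the result is a single line, wrapped here for
// readability:
//
//   startSsml({ rate: '1.2' })
//   // => '<speak xmlns="http://www.w3.org/2001/10/synthesis" version="1.0"
//   //      xml:lang="en-US"><voice name="en-US-JennyNeural"><prosody rate="1.2">'
//   endSsml()
//   // => '</prosody></voice></speak>'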
const hasSignificantText = (node: ChildNode): boolean => {
  let text = ''
  for (const child of Array.from(node.childNodes)) {
    if (child.nodeType === 3 /* Node.TEXT_NODE */) {
      text += child.textContent
    }
  }
  return text.trim().length > 0
}
export const ssmlItemText = (item: SSMLItem): string => {
  return [item.open, ...item.textItems, item.close].join('')
}
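// Illustrative sketch (not part of the original file): ssmlItemText joins an
// item's envelope and text fragments into one SSML string. The values below
// are assumptions:
//
//   const options: SSMLOptions = { primaryVoice: 'en-US-JennyNeural' }
//   ssmlItemText({
//     open: startSsml(options),
//     close: endSsml(),
//     textItems: ['<bookmark mark="3"/>', 'Hello world'],
//     idx: 3,
//   })
//   // => the full <speak>…</speak> document for this paragraph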
export const htmlToSsmlItems = (
  html: string,
  options: SSMLOptions
): SSMLItem[] => {
  console.log('creating ssml with options', options)
  const dom = parseHTML(html)
  const body = dom.document.querySelector('#readability-page-1')
  if (!body) {
    throw new Error('Unable to parse HTML document')
  }
  const parsedNodes = parseDomTree(body)
  if (parsedNodes.length < 1) {
    throw new Error('No HTML nodes found')
  }
  const items: SSMLItem[] = []
  for (let i = 3; i < parsedNodes.length + 3; i++) {
    const textItems: string[] = []
    const node = parsedNodes[i - 3]
    if (TOP_LEVEL_TAGS.includes(node.nodeName) || hasSignificantText(node)) {
      const idx = i
      i = emitElement(textItems, node, true)
      items.push({
        open: startSsml(options, node),
        close: endSsml(),
        textItems: textItems,
        idx,
        voice:
          node.nodeName === 'BLOCKQUOTE' ? options.secondaryVoice : undefined,
      })
    }
  }
  return items
}
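// Illustrative usage sketch (not part of the original file): the input HTML is
// expected to be wrapped in the readability container, as in the sample below:
//
//   const items = htmlToSsmlItems(
//     '<div id="readability-page-1"><p>First paragraph.</p><p>Second.</p></div>',
//     { primaryVoice: 'en-US-JennyNeural', language: 'en-US' }
//   )
//   const ssml = items.map(ssmlItemText).join('\n')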
export const stripEmojis = (text: string): string => {
  const emojiRegex =
    /(?![*#0-9]+)[\p{Emoji}\p{Emoji_Modifier}\p{Emoji_Component}\p{Emoji_Modifier_Base}\p{Emoji_Presentation}]/gu
  return text.replace(emojiRegex, '').replace(/\s+/g, ' ')
}
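// Illustrative sketch (not part of the original file): the negative lookahead
// keeps digits, '#' and '*' (which also carry the Emoji property), while
// pictographic characters are removed and whitespace is collapsed:
//
//   stripEmojis('Top 10 🚀 tips')
//   // => 'Top 10 tips'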
const textToUtterances = ({
  wordTokenizer,
  idx,
  textItems,
  wordOffset,
  voice,
  isHtml = true,
}: {
  wordTokenizer: WordPunctTokenizer
  idx: string
  textItems: string[]
  wordOffset: number
  voice: string
  isHtml?: boolean
}): Utterance[] => {
  let text = textItems.join('')
  if (!isHtml) {
    // for the title
    return [
      {
        idx,
        text,
        wordOffset,
        wordCount: wordTokenizer.tokenize(text).length,
        voice,
      },
    ]
  }
  const utterances: Utterance[] = []
  try {
    text = htmlToText(text, { wordwrap: false })
  } catch (err) {
    console.error('Unable to convert HTML to text')
    text = parseHTML(text).document.documentElement.textContent ?? text
    console.info('Converted HTML to text')
  }
  const MAX_CHARS = 256
  let sentences: string[] = []
  try {
    // use the new sentence tokenizer
    const sentenceTokenizer = new SentenceTokenizerNew()
    sentences = sentenceTokenizer.tokenize(text)
  } catch (err) {
    console.debug('Unable to tokenize sentences')
    // fall back to the old sentence tokenizer
    const sentenceTokenizer = new SentenceTokenizer()
    sentences = sentenceTokenizer.tokenize(text)
  }
  let currentText = ''
  // Split the text into utterances of at most MAX_CHARS (256) characters each,
  // using the NLP sentence tokenizer to avoid splitting words and sentences.
  sentences.forEach((sentence, i) => {
    if (i < sentences.length - 1) {
      // add a space to the end of the sentence
      sentence += ' '
    }
    const nextText = currentText + sentence
    if (nextText.length > MAX_CHARS) {
      if (currentText.length > 0) {
        const wordCount = wordTokenizer.tokenize(currentText).length
        utterances.push({
          idx,
          text: currentText,
          wordOffset,
          wordCount,
          voice,
        })
        wordOffset += wordCount
        currentText = sentence
      } else {
        const wordCount = wordTokenizer.tokenize(sentence).length
        utterances.push({
          idx,
          text: sentence,
          wordOffset,
          wordCount,
          voice,
        })
        wordOffset += wordCount
      }
    } else {
      currentText = nextText
    }
    if (i === sentences.length - 1 && currentText.length > 0) {
      utterances.push({
        idx,
        text: currentText,
        wordOffset,
        wordCount: wordTokenizer.tokenize(currentText).length,
        voice,
      })
    }
  })
  return utterances
}
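// Illustrative sketch (not part of the original file): chunking behaviour of
// textToUtterances. The values below are assumptions; real callers pass the
// textItems produced by emitElement:
//
//   const utterances = textToUtterances({
//     wordTokenizer: new WordPunctTokenizer(),
//     idx: '3',
//     textItems: ['<bookmark mark="3"/>', 'First sentence. Second sentence.'],
//     wordOffset: 0,
//     voice: DEFAULT_VOICE,
//   })
//   // Sentences are grouped into chunks of at most MAX_CHARS characters; a
//   // single sentence longer than that is emitted as its own utterance, and
//   // wordOffset/wordCount track the running word position for each chunk.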
export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
  const { title, content, options } = htmlInput
  console.log('creating speech file with options:', options)
  const language = options.language || DEFAULT_LANGUAGE
  const defaultVoice = options.primaryVoice || DEFAULT_VOICE
  const dom = parseHTML(content)
  const body = dom.document.querySelector('#readability-page-1')
  if (!body) {
    console.log('No HTML body found')
    return {
      wordCount: 0,
      language,
      defaultVoice,
      utterances: [],
    }
  }
  const parsedNodes = parseDomTree(body)
  if (parsedNodes.length < 1) {
    console.log('No HTML nodes found')
    return {
      wordCount: 0,
      language,
      defaultVoice,
      utterances: [],
    }
  }
  const wordTokenizer = new WordPunctTokenizer()
  const utterances: Utterance[] = []
  let wordOffset = 0
  if (title) {
    // the first utterance is the title
    const titleUtterance = textToUtterances({
      wordTokenizer,
      idx: '',
      textItems: [stripEmojis(title)], // the title could contain emoji
      wordOffset,
      isHtml: false,
      voice: defaultVoice,
    })[0]
    utterances.push(titleUtterance)
    wordOffset += titleUtterance.wordCount
  }
  // start at 3 to skip the #readability-content and #readability-page-1 elements
  for (let i = 3; i < parsedNodes.length + 3; i++) {
    const textItems: string[] = []
    const node = parsedNodes[i - 3]
    if (TOP_LEVEL_TAGS.includes(node.nodeName) || hasSignificantText(node)) {
      // use the paragraph as the anchor
      const idx = i.toString()
      i = emitElement(textItems, node, true)
      const newUtterances = textToUtterances({
        wordTokenizer,
        idx,
        textItems,
        wordOffset,
        voice:
          node.nodeName === 'BLOCKQUOTE'
            ? options.secondaryVoice || defaultVoice
            : defaultVoice,
      })
      const wordCount = newUtterances.reduce((acc, u) => acc + u.wordCount, 0)
      wordCount > 0 && utterances.push(...newUtterances)
      wordOffset += wordCount
    }
  }
  return {
    wordCount: wordOffset,
    language,
    defaultVoice,
    utterances,
  }
}
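// Illustrative usage sketch (not part of the original file): content is
// expected to be the readability-processed article HTML; the values below are
// assumptions for demonstration:
//
//   const speechFile = htmlToSpeechFile({
//     title: 'My Article',
//     content: '<div id="readability-page-1"><p>Hello world.</p></div>',
//     options: { primaryVoice: 'en-US-JennyNeural', language: 'en-US' },
//   })
//   // speechFile.utterances[0] is the title; subsequent utterances carry an
//   // idx matching the data-omnivore-anchor-idx assigned by parseDomTree,
//   // which the frontend uses to track reading progress.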