// dockerfile/examples/omnivore/api/text-to-speech/src/htmlToSsml.ts

import { htmlToText } from 'html-to-text'
import { parseHTML } from 'linkedom'
import {
  SentenceTokenizer,
  SentenceTokenizerNew,
  WordPunctTokenizer,
} from 'natural'

// this code needs to be kept in sync with the
// frontend code in: useReadingProgressAnchor

/**
 * Input accepted by `htmlToSpeechFile`.
 */
export interface HtmlInput {
  /** Optional page title; when present it becomes the first utterance. */
  title?: string
  /** HTML to convert; only the `#readability-page-1` element is read. */
  content: string
  /** Voice, rate and language settings. */
  options: SSMLOptions
}

/**
 * One chunk of speakable plain text, as produced by `htmlToSpeechFile`.
 */
export interface Utterance {
  /** Anchor index (as a string) of the element the text came from; `''` for the title. */
  idx: string
  /** Plain text to be spoken. */
  text: string
  /** Running word count of the utterances emitted before this one. */
  wordOffset: number
  /** Number of tokens the word tokenizer finds in `text`. */
  wordCount: number
  /** Name of the voice to use for this utterance. */
  voice: string
}

/**
 * Result of `htmlToSpeechFile`: all utterances of a page plus summary data.
 */
export interface SpeechFile {
  /** Total word count accumulated over the emitted utterances. */
  wordCount: number
  /** Language taken from the options, or the default language. */
  language: string
  /** Primary voice taken from the options, or the default voice. */
  defaultVoice: string
  /** Utterances in document order (title first, when a title was given). */
  utterances: Utterance[]
}

/**
 * One SSML document fragment produced by `htmlToSsmlItems`.
 * `ssmlItemText` joins `open`, `textItems` and `close` into the final SSML.
 */
export type SSMLItem = {
  /** Opening `<speak><voice><prosody>` markup (see `startSsml`). */
  open: string
  /** Matching closing markup (see `endSsml`). */
  close: string
  /** Ordered SSML/text fragments that make up the body. */
  textItems: string[]
  /** Anchor index of the element this item was generated from. */
  idx: number
  /** Secondary voice from the options for BLOCKQUOTE items, otherwise undefined. */
  voice?: string
}

/**
 * Optional speech settings; every field falls back to a module default.
 */
export type SSMLOptions = {
  /** Voice used for everything except BLOCKQUOTE elements. */
  primaryVoice?: string
  /** Voice used for BLOCKQUOTE elements. */
  secondaryVoice?: string
  /** Value placed in the `<prosody rate="…">` attribute. */
  rate?: string
  /** Value placed in the `xml:lang` attribute of `<speak>`. */
  language?: string
}

// Fallbacks used when the corresponding SSMLOptions field is not provided.
const DEFAULT_LANGUAGE = 'en-US'
const DEFAULT_VOICE = 'en-US-JennyNeural'
// Voice used for BLOCKQUOTE elements when no secondary voice is given.
const DEFAULT_SECONDARY_VOICE = 'en-US-GuyNeural'
// Emitted verbatim as the <prosody rate="…"> attribute value.
const DEFAULT_RATE = '1.1'

// Elements carrying any of these attributes are skipped by parseDomTree
// together with their subtree, so they never receive an anchor index.
const ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES = [
  'omnivore-highlight-id',
  'data-twitter-tweet-id',
  'data-instagram-id',
]

/**
 * SSML markup that is emitted at the start of every top-level element.
 *
 * @returns an object whose `opening` field is a 250ms `<break/>` tag
 */
function ssmlTagsForTopLevelElement() {
  const opening = '<break time="250ms"/>'
  return { opening }
}

// Node names (upper-case, as reported by `nodeName`) that always start their
// own speech item, even when they have no direct text of their own.
const TOP_LEVEL_TAGS = [
  'P',
  'BLOCKQUOTE',
  'H1',
  'H2',
  'H3',
  'H4',
  'H5',
  'H6',
  'LI',
]

/**
 * Walks the subtree below `pageNode` depth-first, in document order, and
 * stamps every visited element with a `data-omnivore-anchor-idx` attribute.
 *
 * Non-element nodes are ignored. Elements carrying one of the
 * ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES are ignored together with their subtree.
 *
 * @param pageNode root element; it is traversed but not part of the result
 * @returns the visited descendant elements in document order; the element at
 *          position `k` carries anchor index `k + 3`
 */
function parseDomTree(pageNode: Element) {
  if (!pageNode || pageNode.childNodes.length == 0) {
    console.log('no child nodes found')
    return []
  }

  const pending: Element[] = [pageNode]
  const anchored: Element[] = []

  while (pending.length > 0) {
    const current = pending.pop()
    if (current?.nodeType !== 1 /* Node.ELEMENT_NODE */) {
      continue
    }

    // Dynamic elements must not be counted as anchor-allowed elements
    const isBlocked = ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES.some((attribute) =>
      current.hasAttribute(attribute)
    )
    if (isBlocked) {
      continue
    }

    anchored.push(current)

    // Push children last-to-first so the first child is popped next.
    // Non-element children are filtered out by the nodeType check on pop.
    const children = current.childNodes
    for (let k = children.length - 1; k >= 0; k--) {
      pending.push(children[k] as Element)
    }
  }

  // The root itself is not an anchor target.
  anchored.shift()

  // Numbering starts at 3, because the frontend starts two nodes above us
  // on the #readability-page-1 element that wraps the entire content.
  let anchorIdx = 3
  for (const element of anchored) {
    element.setAttribute('data-omnivore-anchor-idx', String(anchorIdx))
    anchorIdx += 1
  }
  return anchored
}

/**
 * Appends one SSML/text fragment to the end of the output buffer.
 *
 * @param textItems buffer that is mutated in place
 * @param text fragment to append
 */
function emit(textItems: string[], text: string) {
  textItems[textItems.length] = text
}

/**
 * Returns the node's text with emojis removed and whitespace collapsed
 * (via `stripEmojis`); a missing `textContent` is treated as empty text.
 */
function cleanTextNode(textNode: ChildNode): string {
  const rawText = textNode.textContent
  if (rawText == null) {
    return stripEmojis('')
  }
  return stripEmojis(rawText)
}

/**
 * Appends the text of one text node to `textItems`.
 *
 * Runs of whitespace are collapsed to a single space and the XML-reserved
 * characters `&`, `<` and `>` are escaped. The emitted fragments are later
 * parsed as markup (as SSML, or as HTML by the text converter), so raw
 * reserved characters in the page text would either break the document or be
 * mistaken for tags/entities.
 *
 * When the text node's direct parent is a `<b>` element the text is wrapped
 * in `<emphasis>…</emphasis>`.
 *
 * @param textItems output buffer, mutated in place
 * @param cleanedText already-cleaned text of `textNode`; nothing is emitted
 *                    when it is empty
 * @param textNode the node the text came from (only its parent is inspected)
 */
function emitTextNode(
  textItems: string[],
  cleanedText: string,
  textNode: ChildNode
) {
  if (!cleanedText) {
    return
  }

  const ssmlElement =
    textNode.parentNode?.nodeName === 'B' ? 'emphasis' : undefined

  // `&` must be replaced first so the entities added below are not re-escaped
  const escapedText = cleanedText
    .replace(/\s+/g, ' ')
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')

  if (ssmlElement) {
    textItems.push(`<${ssmlElement}>`)
  }
  textItems.push(escapedText)
  if (ssmlElement) {
    textItems.push(`</${ssmlElement}>`)
  }
}

/**
 * Recursively emits the speakable content of `element` into `textItems`.
 *
 * Direct text nodes are emitted (preceded by a `<bookmark>` carrying this
 * element's anchor index when the text is longer than one character), child
 * elements are emitted recursively, and children whose node name is in
 * SKIP_TAGS are ignored entirely.
 *
 * @param textItems output buffer, mutated in place
 * @param element element to emit
 * @param isTopLevel when true, the top-level opening markup (a break) is
 *                   emitted before the content
 * @returns the highest anchor index seen on `element` or any emitted
 *          descendant. The value never drops below the element's own index:
 *          descendants without an anchor attribute (e.g. ones skipped during
 *          anchoring) report 0 and must not lower the result, because callers
 *          use it to advance their position in the anchored node list.
 */
function emitElement(
  textItems: string[],
  element: Element,
  isTopLevel: boolean
): number {
  const SKIP_TAGS = [
    'SCRIPT',
    'STYLE',
    'IMG',
    'FIGURE',
    'FIGCAPTION',
    'IFRAME',
    'CODE',
  ]

  const topLevelTags = ssmlTagsForTopLevelElement()
  const idx = element.getAttribute('data-omnivore-anchor-idx')
  // Number(null) is 0, so an element without an anchor starts at 0
  let maxVisitedIdx = Number(idx)

  if (isTopLevel) {
    emit(textItems, topLevelTags.opening)
  }

  for (const child of Array.from(element.childNodes)) {
    if (SKIP_TAGS.includes(child.nodeName)) {
      continue
    }

    if (
      child.nodeType === 3 /* Node.TEXT_NODE */ &&
      (child.textContent?.length ?? 0) > 0
    ) {
      const cleanedText = cleanTextNode(child)
      if (idx && cleanedText.length > 1) {
        // Make sure it's more than just a space
        emit(textItems, `<bookmark mark="${idx}"/>`)
      }
      emitTextNode(textItems, cleanedText, child)
    } else if (child.nodeType === 1 /* Node.ELEMENT_NODE */) {
      // Keep the running maximum instead of overwriting it: a child without
      // an anchor index returns 0 and would otherwise reset the result.
      maxVisitedIdx = Math.max(
        maxVisitedIdx,
        emitElement(textItems, child as Element, false)
      )
      if (child.nodeName === 'LI') {
        // add a new line after each list item
        emit(textItems, '\n')
      }
    }
  }

  return maxVisitedIdx
}

/**
 * Builds the opening SSML markup (`<speak><voice><prosody>`) for one item.
 *
 * BLOCKQUOTE elements are spoken with the secondary voice, everything else
 * with the primary voice. Every option falls back to its module default when
 * it is missing or an empty string, so an empty voice name can never produce
 * `<voice name="">`.
 *
 * @param options voice, rate and language settings
 * @param element element the item is generated from; only its node name is
 *                inspected
 * @returns the opening markup; close it with `endSsml()`
 */
export const startSsml = (options: SSMLOptions, element?: Element): string => {
  const isBlockquote = element?.nodeName === 'BLOCKQUOTE'
  // `||` rather than `??`: an empty string is not a usable voice name, and
  // this matches how language and rate are resolved.
  const voice = isBlockquote
    ? options.secondaryVoice || DEFAULT_SECONDARY_VOICE
    : options.primaryVoice || DEFAULT_VOICE
  const language = options.language || DEFAULT_LANGUAGE
  const rate = options.rate || DEFAULT_RATE

  return (
    `<speak xmlns="http://www.w3.org/2001/10/synthesis" version="1.0" xml:lang="${language}">` +
    `<voice name="${voice}">` +
    `<prosody rate="${rate}">`
  )
}

/**
 * Closing counterpart of `startSsml`: closes the prosody, voice and speak
 * elements, in that order.
 */
export const endSsml = (): string => {
  const closingMarkup = '</prosody></voice></speak>'
  return closingMarkup
}

/**
 * Tells whether `node` has non-whitespace text among its direct text-node
 * children. Text nested inside child elements is not considered.
 */
const hasSignificantText = (node: ChildNode): boolean => {
  const directText = Array.from(node.childNodes).reduce(
    (collected, child) =>
      child.nodeType === 3 /* Node.TEXT_NODE */
        ? collected + child.textContent
        : collected,
    ''
  )
  return directText.trim().length > 0
}

/**
 * Renders an SSML item to its final string: the opening markup, every text
 * fragment in order, then the closing markup, with no separators.
 */
export const ssmlItemText = (item: SSMLItem): string => {
  const fragments = [item.open].concat(item.textItems, [item.close])
  return fragments.join('')
}

/**
 * Converts Readability HTML into a list of SSML items, one per top-level
 * element (or element with direct text) found under `#readability-page-1`.
 *
 * @param html page HTML containing a `#readability-page-1` element
 * @param options voice, rate and language settings
 * @returns SSML items in document order; render each with `ssmlItemText`
 * @throws Error when `#readability-page-1` is missing or contains no elements
 */
export const htmlToSsmlItems = (
  html: string,
  options: SSMLOptions
): SSMLItem[] => {
  console.log('creating ssml with options', options)

  const dom = parseHTML(html)
  const body = dom.document.querySelector('#readability-page-1')
  if (!body) {
    throw new Error('Unable to parse HTML document')
  }

  const parsedNodes = parseDomTree(body)
  if (parsedNodes.length < 1) {
    throw new Error('No HTML nodes found')
  }

  const items: SSMLItem[] = []
  // `i` is the anchor index; anchor indexes start at 3, so the node for
  // index `i` lives at position `i - 3`.
  for (let i = 3; i < parsedNodes.length + 3; i++) {
    const node = parsedNodes[i - 3]
    const startsItem =
      TOP_LEVEL_TAGS.includes(node.nodeName) || hasSignificantText(node)
    if (!startsItem) {
      continue
    }

    const textItems: string[] = []
    const lastEmittedIdx = emitElement(textItems, node, true)
    items.push({
      open: startSsml(options, node),
      close: endSsml(),
      textItems,
      idx: i,
      voice:
        node.nodeName === 'BLOCKQUOTE' ? options.secondaryVoice : undefined,
    })

    // Skip the descendants that were just emitted, but never move backwards:
    // a smaller value would make the next lookup read before the start of
    // `parsedNodes` and throw.
    i = Math.max(i, lastEmittedIdx)
  }

  return items
}

/**
 * Removes emoji code points (including modifiers and components) from `text`
 * and collapses every run of whitespace into a single space.
 *
 * The characters `*`, `#` and the digits 0-9 are kept: they are part of the
 * Unicode Emoji property set, so the pattern excludes them with a lookahead.
 */
export const stripEmojis = (text: string): string => {
  const emojiRegex =
    /(?![*#0-9]+)[\p{Emoji}\p{Emoji_Modifier}\p{Emoji_Component}\p{Emoji_Modifier_Base}\p{Emoji_Presentation}]/gu
  const withoutEmojis = text.replace(emojiRegex, '')
  const singleSpaced = withoutEmojis.replace(/\s+/g, ' ')
  return singleSpaced
}

/**
 * Turns the emitted fragments of one element into utterances.
 *
 * With `isHtml` false (used for the title) the joined text is returned as a
 * single utterance. Otherwise the joined markup is converted to plain text,
 * split into sentences, and the sentences are packed into utterances of at
 * most 256 characters each; a single sentence longer than that becomes its
 * own utterance rather than being cut.
 *
 * @param wordTokenizer tokenizer used to count words
 * @param idx anchor index recorded on every utterance
 * @param textItems fragments to join and convert
 * @param wordOffset word offset of the first utterance
 * @param voice voice recorded on every utterance
 * @param isHtml whether `textItems` contain markup (default true)
 * @returns the utterances in order; may be empty
 */
const textToUtterances = ({
  wordTokenizer,
  idx,
  textItems,
  wordOffset,
  voice,
  isHtml = true,
}: {
  wordTokenizer: WordPunctTokenizer
  idx: string
  textItems: string[]
  wordOffset: number
  voice: string
  isHtml?: boolean
}): Utterance[] => {
  const joined = textItems.join('')

  if (!isHtml) {
    // for title
    const titleWordCount = wordTokenizer.tokenize(joined).length
    return [
      { idx, text: joined, wordOffset, wordCount: titleWordCount, voice },
    ]
  }

  let plainText: string
  try {
    plainText = htmlToText(joined, { wordwrap: false })
  } catch (err) {
    console.error('Unable to convert HTML to text')
    plainText =
      parseHTML(joined).document.documentElement.textContent ?? joined
    console.info('Converted HTML to text')
  }

  let sentences: string[]
  try {
    // use new sentence tokenizer
    sentences = new SentenceTokenizerNew().tokenize(plainText)
  } catch (err) {
    console.debug('Unable to tokenize sentences')
    // fallback to old sentence tokenizer
    sentences = new SentenceTokenizer().tokenize(plainText)
  }

  const MAX_CHARS = 256
  const utterances: Utterance[] = []
  const lastIndex = sentences.length - 1
  let nextOffset = wordOffset
  let pending = ''

  // Records `chunk` as an utterance at the current offset and advances the
  // offset by the chunk's word count.
  const flush = (chunk: string): void => {
    const wordCount = wordTokenizer.tokenize(chunk).length
    utterances.push({
      idx,
      text: chunk,
      wordOffset: nextOffset,
      wordCount,
      voice,
    })
    nextOffset += wordCount
  }

  // Pack whole sentences into chunks of at most MAX_CHARS characters so that
  // neither words nor sentences are split.
  for (let s = 0; s <= lastIndex; s++) {
    // every sentence but the last gets a trailing space
    const sentence = s < lastIndex ? sentences[s] + ' ' : sentences[s]
    const candidate = pending + sentence

    if (candidate.length <= MAX_CHARS) {
      pending = candidate
    } else if (pending.length > 0) {
      // the sentence does not fit: close the current chunk and start a new one
      flush(pending)
      pending = sentence
    } else {
      // a single over-long sentence becomes its own utterance
      flush(sentence)
    }

    if (s === lastIndex && pending.length > 0) {
      flush(pending)
    }
  }

  return utterances
}

/**
 * Converts Readability HTML (and an optional title) into a speech file: a
 * flat list of plain-text utterances with word offsets and voices.
 *
 * When the HTML has no `#readability-page-1` element, or that element
 * contains no elements, an empty speech file is returned (the title is not
 * emitted in that case).
 *
 * @param htmlInput title, HTML content and speech options
 * @returns the speech file; `wordCount` is the total over all utterances
 */
export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
  const { title, content, options } = htmlInput
  console.log('creating speech file with options:', options)
  const language = options.language || DEFAULT_LANGUAGE
  const defaultVoice = options.primaryVoice || DEFAULT_VOICE

  // Result used whenever there is nothing to read.
  const emptySpeechFile = (): SpeechFile => ({
    wordCount: 0,
    language,
    defaultVoice,
    utterances: [],
  })

  const dom = parseHTML(content)
  const body = dom.document.querySelector('#readability-page-1')
  if (!body) {
    console.log('No HTML body found')
    return emptySpeechFile()
  }

  const parsedNodes = parseDomTree(body)
  if (parsedNodes.length < 1) {
    console.log('No HTML nodes found')
    return emptySpeechFile()
  }

  const wordTokenizer = new WordPunctTokenizer()
  const utterances: Utterance[] = []
  let wordOffset = 0
  if (title) {
    // first utterances is the title
    const titleUtterance = textToUtterances({
      wordTokenizer,
      idx: '',
      textItems: [stripEmojis(title)], // title could have emoji
      wordOffset,
      isHtml: false,
      voice: defaultVoice,
    })[0]
    utterances.push(titleUtterance)
    wordOffset += titleUtterance.wordCount
  }

  // start at 3 to skip the #readability-content and #readability-page-1 elements
  for (let i = 3; i < parsedNodes.length + 3; i++) {
    const node = parsedNodes[i - 3]
    const startsItem =
      TOP_LEVEL_TAGS.includes(node.nodeName) || hasSignificantText(node)
    if (!startsItem) {
      continue
    }

    // use paragraph as anchor
    const idx = i.toString()
    const textItems: string[] = []
    const lastEmittedIdx = emitElement(textItems, node, true)
    const newUtterances = textToUtterances({
      wordTokenizer,
      idx,
      textItems,
      wordOffset,
      voice:
        node.nodeName === 'BLOCKQUOTE'
          ? options.secondaryVoice || defaultVoice
          : defaultVoice,
    })
    const wordCount = newUtterances.reduce((acc, u) => acc + u.wordCount, 0)
    // elements that produced no words are dropped entirely
    if (wordCount > 0) {
      utterances.push(...newUtterances)
    }
    wordOffset += wordCount

    // Skip the descendants that were just emitted, but never move backwards:
    // a smaller value would make the next lookup read before the start of
    // `parsedNodes` and throw.
    i = Math.max(i, lastEmittedIdx)
  }

  return {
    wordCount: wordOffset,
    language,
    defaultVoice,
    utterances,
  }
}