audiobookshelf/server/utils/parsers/parsePodcastDescriptionForChapters.js

const sanitizeHtml = require('../../libs/sanitizeHtml')
const Logger = require('../../Logger')

/**
 * Parse a podcast description for timestamps and generate chapters.
 * The following formats are supported:
 *
 * MM:SS Chapter name
 * HH:MM:SS Chapter name
 * (HH:MM:SS) Chapter name
 *
 * Descriptions have to use </p>, <br> (also <br/> or <br />), </li> or \n to split up lines in order to be supported
 *
 * See test suite for more input examples
 *
 * @param {string} podcastDescription - description text, may contain HTML (tags are stripped from each line)
 * @param {number} audioDurationSecs - used as the end of the last chapter and as the upper bound for chapter starts
 * @returns {ChapterObject[]} chapters in the order they appear ({ id, title, start, end }); empty if no timestamps were found
 * @throws {Error} if an argument is null/undefined, a timestamp has minutes or seconds above 59,
 *   a chapter starts after the audio duration, a title cannot be extracted or is too long,
 *   or exactly one chapter was found
 */
module.exports.parse = (podcastDescription, audioDurationSecs) => {
  if (podcastDescription == null) {
    throw new Error('Description must not be null')
  }

  if (audioDurationSecs == null) {
    throw new Error('Audio duration must not be null')
  }

  // This number is arbitrary, but there have been examples where descriptions of the chapter are on the same line as the chapter title
  // This results in an unpleasant UX where the chapter is very long, it's also possible that an overly long chapter title is the result of a parsing failure
  const maxChapterTitleLength = 200

  // Captures: [1] first field, [2] second field, [3] optional third field
  const timestampRegex = /\b(\d{1,2}):(\d{1,2})(?::(\d{1,2}))?\b/
  // Captures everything after the timestamp (and an optional closing parenthesis) as the title
  const chapterTitleRegex = /\b\d{1,2}:\d{1,2}(?::\d{1,2})?\b(?:\s+|\))(.+)$/

  // Split on "</p>", "<br>", "<br/>", "<br />", "\n", "</li>" (tag names are matched case-insensitively).
  // The slash in <br> is optional: a plain "<br>" must split lines too, otherwise adjacent chapters
  // are glued together once the tags are stripped.
  const descriptionLineSplitRegex = /<\s*\/\s*p\s*>|<\s*br\s*\/?\s*>|\n|<\s*\/\s*li\s*>/i

  const descriptionLines = podcastDescription.split(descriptionLineSplitRegex)
  const newChapters = []

  for (const descriptionLine of descriptionLines) {
    // Strip all HTML tags out
    const line = sanitizeHtml(descriptionLine, { allowedTags: [] })

    // Lines without a timestamp are not chapters and are skipped
    const match = timestampRegex.exec(line)
    if (match == null) continue

    const [, first, second, third] = match

    let hours = 0
    let minutes = 0
    let seconds = 0

    if (third !== undefined) {
      // If there's three components then we can assume its hh:mm:ss
      hours = Number(first)
      minutes = Number(second)
      seconds = Number(third)
    } else {
      // otherwise assume mm:ss
      minutes = Number(first)
      seconds = Number(second)
    }

    if (minutes > 59 || seconds > 59) {
      throw new Error(`Timestamp contains invalid minutes or seconds field '${minutes}::${seconds}'`)
    }

    const startTime = seconds + minutes * 60 + hours * 60 * 60
    if (startTime > audioDurationSecs) {
      throw new Error(`Chapter found that starts after over audio duration. Duration: ${audioDurationSecs}s - Chapter start ${startTime}s`)
    }

    const chapterTitleMatch = chapterTitleRegex.exec(line)

    if (chapterTitleMatch == null || chapterTitleMatch.length < 2) {
      // Unknown chapter state
      throw new Error(`Unable to get chapter title from description, line ${line}`)
    }

    const chapterTitle = chapterTitleMatch[1].trim()
    if (chapterTitle.length > maxChapterTitleLength) {
      throw new Error(`Chapter title too long, possible parsing falure, line ${line}`)
    }

    // Each chapter ends where the next one starts
    if (newChapters.length > 0) {
      newChapters[newChapters.length - 1].end = startTime
    }

    newChapters.push({ title: chapterTitle, id: newChapters.length + 1, start: startTime })
  }

  // The final chapter runs until the end of the audio
  if (newChapters.length > 0) {
    newChapters[newChapters.length - 1].end = audioDurationSecs
  }

  if (newChapters.length === 1) {
    throw new Error('Only one chapter found, treating as invalid description')
  }

  // Logged only after validation so a rejected description is not reported as a success
  Logger.info(`Successfully generated ${newChapters.length} chapters`)

  return newChapters
}
Handle podcasts which use html lists and also have html tags in the chapter titles 2026-03-16 20:59:00 +00:00			`const sanitizeHtml = require('../../libs/sanitizeHtml')`
- Add new migration to add an autoGenerateChapters column in the Podcasts table - Bump minor version (I wasn't sure if this was needed for the migration) - Feature is now controlled by the field in the podcast database object - Move parsing code and tests to existing utils/parsers/ dir - Add more test cases 2026-03-16 18:42:01 +00:00			`const Logger = require('../../Logger')`

			`/**`
			`* Parse podcast descriptions for timestamps and generate chapters`
			`* The following formats are supports:`
			`*`
			`* MM:SS Chapter name`
			`* HH:MM:SS Chapter name`
			`* (HH:MM:SS) Chapter name`
			`*`
			`* Descriptions have to use <p>, <br> or \n to split up lines in order to be supported`
			`*`
			`* See test suite for more input examples`
			`*`
			`* @param {string} podcastDescription`
			`* @param {number} audioDurationSecs`
			`* @returns {ChapterObject[]}`
			`*/`
			`module.exports.parse = (podcastDescription, audioDurationSecs) => {`
			`if (podcastDescription == null) {`
			`throw new Error('Description must not be null')`
			`}`

			`if (audioDurationSecs == null) {`
			`throw new Error('Audio duration must not be null')`
			`}`

Handle chapters names that are very long, add examples to tests 2026-03-17 18:52:56 +00:00			`// This number is arbitrary, but there have been examples where descriptions of the chapter are on the same line as the chapter title`
			`// This results in a unpleasant UX where the chapter is very long, it's also possible that an overly long chapter title is the result of a parsing failure`
			`const maxChapterTitleLength = 200`

- Add new migration to add an autoGenerateChapters column in the Podcasts table - Bump minor version (I wasn't sure if this was needed for the migration) - Feature is now controlled by the field in the podcast database object - Move parsing code and tests to existing utils/parsers/ dir - Add more test cases 2026-03-16 18:42:01 +00:00			`const timestampRegex = /\b(\d{1,2}):(\d{1,2})(?::(\d{1,2}))?\b/`
			`const chapterTitleRegex = /\b\d{1,2}:\d{1,2}(?::\d{1,2})?\b(?:\s+|\))(.+)$/`
Handle podcasts which use html lists and also have html tags in the chapter titles 2026-03-16 20:59:00 +00:00
			`// Split on "</p>", "<br />", "\n", </li>`
			`const descriptionLineSplitRegex = /\<\s*\/\s*p\s*\>|\<\s*br\s*\/\>|\n|\<\s*\/\s*li\s*\>/`
- Add new migration to add an autoGenerateChapters column in the Podcasts table - Bump minor version (I wasn't sure if this was needed for the migration) - Feature is now controlled by the field in the podcast database object - Move parsing code and tests to existing utils/parsers/ dir - Add more test cases 2026-03-16 18:42:01 +00:00
			`var descriptionLines = podcastDescription.split(descriptionLineSplitRegex)`
			`var newChapters = []`

			`for (let i = 0; i < descriptionLines.length; i++) {`
Handle podcasts which use html lists and also have html tags in the chapter titles 2026-03-16 20:59:00 +00:00			`// Strip all HTML tags out`
			`let line = sanitizeHtml(descriptionLines[i], { allowedTags: [] })`
- Add new migration to add an autoGenerateChapters column in the Podcasts table - Bump minor version (I wasn't sure if this was needed for the migration) - Feature is now controlled by the field in the podcast database object - Move parsing code and tests to existing utils/parsers/ dir - Add more test cases 2026-03-16 18:42:01 +00:00
			`let match = timestampRegex.exec(line)`
			`if (match == null) continue`

			`let first = match[1]`
			`let second = match[2]`
			`let third = match[3]`

			`let hours = 0`
			`let minutes = 0`
			`let seconds = 0`

			`// If there's three components then we can assume its hh:mm:ss`
			`if (first && second && third) {`
			`hours = Number(first)`
			`minutes = Number(second)`
			`seconds = Number(third)`
			`} else if (first && second) // otherwise assume mm:ss`
			`{`
			`minutes = Number(first)`
			`seconds = Number(second)`
			`}`

			`if (minutes > 59 || seconds > 59) {`
			throw new Error(`Timestamp contains invalid minutes or seconds field '${minutes}::${seconds}'`)
			`}`

			`let startTime = seconds + minutes * 60 + hours * 60 * 60`
			`if (startTime > audioDurationSecs) {`
			throw new Error(`Chapter found that starts after over audio duration. Duration: ${audioDurationSecs}s - Chapter start ${startTime}s`)
			`}`

			`let chapterTitleMatch = chapterTitleRegex.exec(line)`

			`if (chapterTitleMatch == null || chapterTitleMatch.length < 2) {`
			`// Unknown chapter state`
			throw new Error(`Unable to get chapter title from description, line ${line}`)
			`}`

Handle chapters names that are very long, add examples to tests 2026-03-17 18:52:56 +00:00			`let chapterTitle = chapterTitleMatch[1].trim()`
			`if (chapterTitle.length > maxChapterTitleLength) {`
			throw new Error(`Chapter title too long, possible parsing falure, line ${line}`)
			`}`

			`let chapter = { title: chapterTitle, id: newChapters.length + 1, start: startTime }`
- Add new migration to add an autoGenerateChapters column in the Podcasts table - Bump minor version (I wasn't sure if this was needed for the migration) - Feature is now controlled by the field in the podcast database object - Move parsing code and tests to existing utils/parsers/ dir - Add more test cases 2026-03-16 18:42:01 +00:00
			`if (newChapters.length > 0) {`
			`newChapters[newChapters.length - 1].end = startTime`
			`}`

			`newChapters.push(chapter)`
			`}`
			`if (newChapters.length > 0) {`
			`newChapters[newChapters.length - 1].end = audioDurationSecs`
			`}`

Handle podcasts which use html lists and also have html tags in the chapter titles 2026-03-16 20:59:00 +00:00			Logger.info(`Successfully generated ${newChapters.length} chapters`)
- Add new migration to add an autoGenerateChapters column in the Podcasts table - Bump minor version (I wasn't sure if this was needed for the migration) - Feature is now controlled by the field in the podcast database object - Move parsing code and tests to existing utils/parsers/ dir - Add more test cases 2026-03-16 18:42:01 +00:00
			`if (newChapters.length == 1) {`
			`throw new Error('Only one chapter found, treating as invalid description')`
			`}`

			`return newChapters`
			`}`