Merge 95fb522e8d into 47ea6b5092

2026-05-16 16:31:30 +00:00 · 2026-05-06 13:51:21 +02:00 · 2026-05-06 13:51:21 +02:00 · 3485db8ee8
commit 3485db8ee8
parent 47ea6b5092 95fb522e8d
3 changed files with 291 additions and 0 deletions
--- a/server/models/PodcastEpisode.js
+++ b/server/models/PodcastEpisode.js
@ -1,5 +1,7 @@
 const { DataTypes, Model } = require('sequelize')
 const libraryItemsPodcastFilters = require('../utils/queries/libraryItemsPodcastFilters')
+const parsePodcastDescriptionForChapters = require('../utils/parsers/parsePodcastDescriptionForChapters')
+const Logger = require('../Logger')
 /**
 * @typedef ChapterObject
 * @property {number} id
@ -85,6 +87,17 @@ class PodcastEpisode extends Model {
      podcastEpisode.chapters = audioFile.chapters.map((ch) => ({ ...ch }))
    } else if (rssPodcastEpisode.chapters?.length) {
      podcastEpisode.chapters = rssPodcastEpisode.chapters.map((ch) => ({ ...ch }))
+    } else {
+      Logger.debug("[PodcastEpisode] New episode doesn't have chapters, attempting to generate them from timestamps", rssPodcastEpisode.title)
+      try {
+        podcastEpisode.chapters = parsePodcastDescriptionForChapters.parse(podcastEpisode.description, podcastEpisode.audioFile.duration)
+
+        if (podcastEpisode.chapters.length > 0) {
+          Logger.info(`[PodcastEpisode] Successfully generated ${podcastEpisode.chapters.length} chapters`)
+        }
+      } catch (error) {
+        Logger.error(`[PodcastEpisode] createFromRssPodcastEpisode: Failed to generate chapters for "${podcastEpisode.title}"`, error)
+      }
    }

    return this.create(podcastEpisode)
--- a/server/utils/parsers/parsePodcastDescriptionForChapters.js
+++ b/server/utils/parsers/parsePodcastDescriptionForChapters.js
@ -0,0 +1,112 @@
+const sanitizeHtml = require('../../libs/sanitizeHtml')
+const Logger = require('../../Logger')
+
+/**
+ * Parse podcast descriptions for timestamps and generate chapters
+ * The following formats are supports:
+ *
+ * MM:SS Chapter name
+ * HH:MM:SS Chapter name
+ * (HH:MM:SS) Chapter name
+ *
+ * Descriptions have to use <p>, <br> or \n to split up lines in order to be supported
+ *
+ * See test suite for more input examples
+ *
+ * @param {string} podcastDescription
+ * @param {number} audioDurationSecs
+ * @returns {ChapterObject[]}
+ */
+module.exports.parse = (podcastDescription, audioDurationSecs) => {
+  if (podcastDescription == null) {
+    throw new Error('Description must not be null')
+  }
+
+  if (audioDurationSecs == null) {
+    throw new Error('Audio duration must not be null')
+  }
+
+  // This number is arbitrary, but there have been examples where descriptions of the chapter are on the same line as the chapter title
+  // This results in a unpleasant UX where the chapter is very long, it's also possible that an overly long chapter title is the result of a parsing failure
+  const maxChapterTitleLength = 200
+
+  const timestampRegex = /\b(\d{1,2}):(\d{1,2})(?::(\d{1,2}))?\b/
+  const chapterTitleRegex = /\b\d{1,2}:\d{1,2}(?::\d{1,2})?\b(?:\s+|\))(.+)$/
+
+  // Split on "</p>", "<br />", "\n", </li>
+  const descriptionLineSplitRegex = /\<\s*\/\s*p\s*\>|\<\s*br\s*\/\>|\n|\<\s*\/\s*li\s*\>/
+
+  // Early out if there aren't any timestamps in the entire description
+  if (timestampRegex.exec(podcastDescription) == null) {
+    Logger.debug('No timestamps found in description, bailing out early')
+    return []
+  }
+
+  var descriptionLines = podcastDescription.split(descriptionLineSplitRegex)
+  var newChapters = []
+
+  for (let i = 0; i < descriptionLines.length; i++) {
+    // Strip all HTML tags out
+    let line = sanitizeHtml(descriptionLines[i], { allowedTags: [] })
+
+    let match = timestampRegex.exec(line)
+    if (match == null) continue
+
+    let first = match[1]
+    let second = match[2]
+    let third = match[3]
+
+    let hours = 0
+    let minutes = 0
+    let seconds = 0
+
+    // If there's three components then we can assume its hh:mm:ss
+    if (first && second && third) {
+      hours = Number(first)
+      minutes = Number(second)
+      seconds = Number(third)
+    } else if (first && second) // otherwise assume mm:ss
+    {
+      minutes = Number(first)
+      seconds = Number(second)
+    }
+
+    if (minutes > 59 || seconds > 59) {
+      throw new Error(`Timestamp contains invalid minutes or seconds field '${minutes}::${seconds}'`)
+    }
+
+    let startTime = seconds + minutes * 60 + hours * 60 * 60
+    if (startTime > audioDurationSecs) {
+      throw new Error(`Chapter found that starts after over audio duration. Duration: ${audioDurationSecs}s - Chapter start ${startTime}s`)
+    }
+
+    let chapterTitleMatch = chapterTitleRegex.exec(line)
+
+    if (chapterTitleMatch == null || chapterTitleMatch.length < 2) {
+      // Unknown chapter state
+      throw new Error(`Unable to get chapter title from description, line ${line}`)
+    }
+
+    let chapterTitle = chapterTitleMatch[1].trim()
+    if (chapterTitle.length > maxChapterTitleLength) {
+      throw new Error(`Chapter title too long, possible parsing falure, line ${line}`)
+    }
+
+    let chapter = { title: chapterTitle, id: newChapters.length + 1, start: startTime }
+
+    if (newChapters.length > 0) {
+      newChapters[newChapters.length - 1].end = startTime
+    }
+
+    newChapters.push(chapter)
+  }
+  if (newChapters.length > 0) {
+    newChapters[newChapters.length - 1].end = audioDurationSecs
+  }
+
+  if (newChapters.length == 1) {
+    throw new Error('Only one chapter found, treating as invalid description')
+  }
+
+  return newChapters
+}