This commit is contained in:
Harry 2026-05-06 13:51:21 +02:00 committed by GitHub
commit 3485db8ee8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 291 additions and 0 deletions

View file

@ -1,5 +1,7 @@
const { DataTypes, Model } = require('sequelize')
const libraryItemsPodcastFilters = require('../utils/queries/libraryItemsPodcastFilters')
const parsePodcastDescriptionForChapters = require('../utils/parsers/parsePodcastDescriptionForChapters')
const Logger = require('../Logger')
/**
* @typedef ChapterObject
* @property {number} id
@ -85,6 +87,17 @@ class PodcastEpisode extends Model {
podcastEpisode.chapters = audioFile.chapters.map((ch) => ({ ...ch }))
} else if (rssPodcastEpisode.chapters?.length) {
podcastEpisode.chapters = rssPodcastEpisode.chapters.map((ch) => ({ ...ch }))
} else {
Logger.debug("[PodcastEpisode] New episode doesn't have chapters, attempting to generate them from timestamps", rssPodcastEpisode.title)
try {
podcastEpisode.chapters = parsePodcastDescriptionForChapters.parse(podcastEpisode.description, podcastEpisode.audioFile.duration)
if (podcastEpisode.chapters.length > 0) {
Logger.info(`[PodcastEpisode] Successfully generated ${podcastEpisode.chapters.length} chapters`)
}
} catch (error) {
Logger.error(`[PodcastEpisode] createFromRssPodcastEpisode: Failed to generate chapters for "${podcastEpisode.title}"`, error)
}
}
return this.create(podcastEpisode)

View file

@ -0,0 +1,112 @@
const sanitizeHtml = require('../../libs/sanitizeHtml')
const Logger = require('../../Logger')
/**
 * Parse a podcast description for timestamps and generate chapters from them.
 * The following timestamp formats are supported:
 *
 * MM:SS Chapter name
 * HH:MM:SS Chapter name
 * (HH:MM:SS) Chapter name
 *
 * Descriptions have to separate lines with </p>, <br>, <br />, </li> or a newline (\n or \r\n) in order to be supported.
 * Every line that contains a timestamp is treated as a chapter; lines without one are ignored.
 * Each chapter ends where the next one starts, the last chapter ends at audioDurationSecs.
 *
 * See test suite for more input examples
 *
 * @param {string} podcastDescription - description text, may contain HTML
 * @param {number} audioDurationSecs - duration of the audio in seconds, used as the end of the last chapter
 * @returns {ChapterObject[]} generated chapters, or an empty array when the description contains no timestamps
 * @throws {Error} if an argument is null/undefined, a timestamp has minutes or seconds above 59,
 * a chapter starts after the audio duration, a chapter title is missing or too long, or only one chapter is found
 */
module.exports.parse = (podcastDescription, audioDurationSecs) => {
  if (podcastDescription == null) {
    throw new Error('Description must not be null')
  }
  if (audioDurationSecs == null) {
    throw new Error('Audio duration must not be null')
  }

  // This number is arbitrary, but there have been examples where descriptions of the chapter are on the same line as the chapter title
  // This results in an unpleasant UX where the chapter is very long, it's also possible that an overly long chapter title is the result of a parsing failure
  const maxChapterTitleLength = 200

  // Captures (first):(second) and an optional :(third) component
  const timestampRegex = /\b(\d{1,2}):(\d{1,2})(?::(\d{1,2}))?\b/
  // Captures everything after the timestamp (and an optional closing parenthesis) as the title
  const chapterTitleRegex = /\b\d{1,2}:\d{1,2}(?::\d{1,2})?\b(?:\s+|\))(.+)$/
  // Split on "</p>", "<br>", "<br />", "\n", "\r\n" and "</li>" (case-insensitive).
  // A plain "<br>" has to be handled here: if it is left in the line it gets stripped with the other tags below,
  // which would glue two chapter lines together. "\r" must be consumed as well because "." in the title regex
  // does not match it, so a trailing "\r" would make the title lookup fail.
  const descriptionLineSplitRegex = /<\s*\/\s*p\s*>|<\s*br\s*\/?\s*>|\r?\n|<\s*\/\s*li\s*>/i

  // Early out if there aren't any timestamps in the entire description
  if (timestampRegex.exec(podcastDescription) == null) {
    Logger.debug('No timestamps found in description, bailing out early')
    return []
  }

  const descriptionLines = podcastDescription.split(descriptionLineSplitRegex)
  const newChapters = []

  for (const descriptionLine of descriptionLines) {
    // Strip all HTML tags out
    const line = sanitizeHtml(descriptionLine, { allowedTags: [] })

    const match = timestampRegex.exec(line)
    if (match == null) continue

    const [, first, second, third] = match
    let hours = 0
    let minutes = 0
    let seconds = 0
    if (third !== undefined) {
      // If there's three components then we can assume its hh:mm:ss
      hours = Number(first)
      minutes = Number(second)
      seconds = Number(third)
    } else {
      // otherwise assume mm:ss
      minutes = Number(first)
      seconds = Number(second)
    }

    if (minutes > 59 || seconds > 59) {
      throw new Error(`Timestamp contains invalid minutes or seconds field '${minutes}::${seconds}'`)
    }

    const startTime = seconds + minutes * 60 + hours * 60 * 60
    if (startTime > audioDurationSecs) {
      throw new Error(`Chapter found that starts after over audio duration. Duration: ${audioDurationSecs}s - Chapter start ${startTime}s`)
    }

    const chapterTitleMatch = chapterTitleRegex.exec(line)
    if (chapterTitleMatch == null || chapterTitleMatch.length < 2) {
      // Unknown chapter state
      throw new Error(`Unable to get chapter title from description, line ${line}`)
    }

    const chapterTitle = chapterTitleMatch[1].trim()
    if (chapterTitle.length > maxChapterTitleLength) {
      throw new Error(`Chapter title too long, possible parsing falure, line ${line}`)
    }

    // The previous chapter ends where this one starts
    if (newChapters.length > 0) {
      newChapters[newChapters.length - 1].end = startTime
    }
    newChapters.push({ title: chapterTitle, id: newChapters.length + 1, start: startTime })
  }

  // The last chapter runs until the end of the audio
  if (newChapters.length > 0) {
    newChapters[newChapters.length - 1].end = audioDurationSecs
  }

  // A single timestamp is more likely an incidental mention than a chapter list
  if (newChapters.length === 1) {
    throw new Error('Only one chapter found, treating as invalid description')
  }

  return newChapters
}

View file

@ -0,0 +1,166 @@
const chai = require('chai')
const expect = chai.expect
const parsePodcastDescriptionForChapters = require('../../../../server/utils/parsers/parsePodcastDescriptionForChapters')
const sinon = require('sinon')
const Logger = require('../../../../server/Logger')
// Test suite for parsePodcastDescriptionForChapters.parse.
// Success cases compare the returned chapters against an expected list,
// failure cases assert on the message of the thrown error.
describe('parsePodcastDescriptionForChapters', () => {
  // Restore stubs in a hook so a failing assertion can't leak a stubbed Logger into later tests
  afterEach(() => {
    sinon.restore()
  })

  it("should early out if description doens't contain timestamps", () => {
    const loggerDebugStub = sinon.stub(Logger, 'debug')
    const description = '<p>Introduction text paragraph 1</p><p>Introduction text paragraph 2</p>'

    const chapters = parsePodcastDescriptionForChapters.parse(description, 1000)

    expect(chapters).to.be.empty
    expect(loggerDebugStub.calledWith('No timestamps found in description, bailing out early')).to.be.true
  })

  const testCasesTestingSuccess = [
    {
      testName: 'Should handle descriptions using html paragraphs',
      description: '<p>Introduction text paragraph 1</p><p>Introduction text paragraph 2</p><p>00:48 Chatper 1 </p><p>12:14 Chapter 2 </p><p>20:56 Chapter 3 </p><p>27:34 Chapter 4 </p><p>32:00 Chapter 5 </p><p>35:16 Chapter 6 </p><p>41:32 Chapter 7 </p><p>46:43 Chapter 8</p>',
      audioDuration: 3060,
      expectedChapters: [
        { title: 'Chatper 1', id: 1, start: 48, end: 734 },
        { title: 'Chapter 2', id: 2, start: 734, end: 1256 },
        { title: 'Chapter 3', id: 3, start: 1256, end: 1654 },
        { title: 'Chapter 4', id: 4, start: 1654, end: 1920 },
        { title: 'Chapter 5', id: 5, start: 1920, end: 2116 },
        { title: 'Chapter 6', id: 6, start: 2116, end: 2492 },
        { title: 'Chapter 7', id: 7, start: 2492, end: 2803 },
        { title: 'Chapter 8', id: 8, start: 2803, end: 3060 }
      ]
    },
    {
      // Example: https://podcasts.apple.com/us/podcast/giant-bombcast-931-bleepbloop-remote/id274450056?i=1000754550540
      testName: 'Should handle descriptions using html line breaks',
      description: '<br>Introduction text paragraph 1<br /><br>Introduction text paragraph 2<br /><br />0:00:00 Chapter 1<br />0:17:05 Chapter 2<br />0:33:58 Chapter 3<br />0:40:35 Chapter 4<br />Unrelated outro line<br />',
      audioDuration: 2700,
      expectedChapters: [
        { title: 'Chapter 1', id: 1, start: 0, end: 1025 },
        { title: 'Chapter 2', id: 2, start: 1025, end: 2038 },
        { title: 'Chapter 3', id: 3, start: 2038, end: 2435 },
        { title: 'Chapter 4', id: 4, start: 2435, end: 2700 }
      ]
    },
    {
      // Example: https://podcasts.apple.com/us/podcast/xboxs-big-helix-reveal-witcher-4-path-tracing-crimson/id1596728253?i=1000755411491
      // The template literal content is intentionally not indented, the raw newlines are the input under test
      testName: 'Should handle descriptions using unix new lines',
      description: `Introduction text paragraph 1
Introduction text paragraph 2
0:00:00 Chapter 1
0:17:05 Chapter 2
0:33:58 Chapter 3
0:40:35 Chapter 4
Unrelated outro line`,
      audioDuration: 2700,
      expectedChapters: [
        { title: 'Chapter 1', id: 1, start: 0, end: 1025 },
        { title: 'Chapter 2', id: 2, start: 1025, end: 2038 },
        { title: 'Chapter 3', id: 3, start: 2038, end: 2435 },
        { title: 'Chapter 4', id: 4, start: 2435, end: 2700 }
      ]
    },
    {
      testName: 'Should handle descriptions with no timestamps',
      description: 'Lorem ipsum dolor sit amet consectetur adipiscing elit quisque faucibus ex sapien vitae pellentesque sem placerat in id cursus mi pretium tellus duis convallis tempus leo eu aenean sed diam urna tempor pulvinar vivamus fringilla lacus nec metus bibendum egestas.',
      audioDuration: 2700,
      expectedChapters: []
    },
    {
      testName: 'Should handle timestampes in parentheses',
      description: '<p>Introduction text paragraph 1</p><p>Introduction text paragraph 2</p><p>(00:48) Chatper 1 </p><p>(12:14) Chapter 2 </p><p>(20:56) Chapter 3 </p><p>(27:34) Chapter 4 </p><p>(32:00) Chapter 5 </p><p>(35:16) Chapter 6 </p><p>(41:32) Chapter 7 </p><p>(46:43) Chapter 8</p>',
      audioDuration: 3060,
      expectedChapters: [
        { title: 'Chatper 1', id: 1, start: 48, end: 734 },
        { title: 'Chapter 2', id: 2, start: 734, end: 1256 },
        { title: 'Chapter 3', id: 3, start: 1256, end: 1654 },
        { title: 'Chapter 4', id: 4, start: 1654, end: 1920 },
        { title: 'Chapter 5', id: 5, start: 1920, end: 2116 },
        { title: 'Chapter 6', id: 6, start: 2116, end: 2492 },
        { title: 'Chapter 7', id: 7, start: 2492, end: 2803 },
        { title: 'Chapter 8', id: 8, start: 2803, end: 3060 }
      ]
    },
    {
      // Example here: https://podcasts.apple.com/gb/podcast/daniel-priestley-plumbers-will-earn-more-than-lawyers/id1291423644?i=1000755513967
      testName: 'Should handle html lists and chapters with html tags in the title',
      description: '<p>Introduction</p><p><br /></p><p><br /></p>Chapters<ul><li><strong>00:00:00</strong> Intro</li><li><strong>00:03:55</strong> Chapter 1</li><li><strong>00:09:52</strong> Chapter 2 </li><li><strong>00:16:11</strong> Chapter 3</li><li><strong>00:20:03</strong> Chapter 4</li><li><strong>00:24:08</strong> Chapter 5</li>',
      audioDuration: 4000,
      expectedChapters: [
        { title: 'Intro', id: 1, start: 0, end: 235 },
        { title: 'Chapter 1', id: 2, start: 235, end: 592 },
        { title: 'Chapter 2', id: 3, start: 592, end: 971 },
        { title: 'Chapter 3', id: 4, start: 971, end: 1203 },
        { title: 'Chapter 4', id: 5, start: 1203, end: 1448 },
        { title: 'Chapter 5', id: 6, start: 1448, end: 4000 }
      ]
    }
  ]

  testCasesTestingSuccess.forEach((testCase) => {
    it(testCase.testName, () => {
      const chapters = parsePodcastDescriptionForChapters.parse(testCase.description, testCase.audioDuration)
      expect(chapters).to.be.deep.equal(testCase.expectedChapters)
    })
  })

  const testCasesTestingFailure = [
    {
      testName: 'Should throw if only one chapter found',
      description: '<p>Introduction text paragraph 1</p><p>Introduction text paragraph 2</p><p>00:48 Chatper 1 </p>',
      audioDuration: 1000,
      expectedError: 'Only one chapter found, treating as invalid description'
    },
    {
      testName: 'Should throw if invalid minutes',
      description: '<p>Introduction text paragraph 1</p><p>Introduction text paragraph 2</p><p>75:48 Chatper 1 </p>',
      audioDuration: 1000,
      expectedError: "Timestamp contains invalid minutes or seconds field '75::48'"
    },
    {
      // Seconds field is out of range here, the title must differ from the minutes case above
      testName: 'Should throw if invalid seconds',
      description: '<p>Introduction text paragraph 1</p><p>Introduction text paragraph 2</p><p>00:90 Chatper 1 </p>',
      audioDuration: 1000,
      expectedError: "Timestamp contains invalid minutes or seconds field '0::90'"
    },
    {
      testName: 'Should throw if chapter goes over lenght of audio file',
      description: '<p>Introduction text paragraph 1</p><p>Introduction text paragraph 2</p><p>00:48 Chatper 1 </p><p>01:00:01 Chatper 2 </p>',
      audioDuration: 3600,
      expectedError: 'Chapter found that starts after over audio duration'
    },
    {
      testName: 'Should throw if description is null',
      description: null,
      audioDuration: 1000,
      expectedError: 'Description must not be null'
    },
    {
      testName: 'Should throw if audio duration is null',
      description: '',
      audioDuration: null,
      expectedError: 'Audio duration must not be null'
    },
    {
      testName: 'Should throw if chapter has no title',
      description: '<p>Introduction text paragraph 1</p><p>Introduction text paragraph 2</p><p>00:48 Chatper 1 </p><p>00:30:00</p>',
      audioDuration: 3600,
      expectedError: 'Unable to get chapter title from description'
    },
    {
      // Example here: https://podcasts.apple.com/us/podcast/is-your-personal-finance-indecision-costing-you-plus/id1256091892?i=1000636624926
      testName: 'Should throw if chapter is too long',
      description: '<p>01:19 Chapter 1</p><p>10:00 Chapter 2: Lorem ipsum dolor sit amet consectetur adipiscing elit quisque faucibus ex sapien vitae pellentesque sem placerat in id cursus mi pretium tellus duis convallis tempus leo eu aenean sed diam urna tempor pulvinar vivamus fringilla></p>',
      audioDuration: 3600,
      expectedError: 'Chapter title too long, possible parsing falure'
    }
  ]

  testCasesTestingFailure.forEach((testCase) => {
    it(testCase.testName, () => {
      expect(() => {
        parsePodcastDescriptionForChapters.parse(testCase.description, testCase.audioDuration)
      }).to.throw(testCase.expectedError)
    })
  })
})