diff --git a/server/models/PodcastEpisode.js b/server/models/PodcastEpisode.js index 996f55f5..fdef2c50 100644 --- a/server/models/PodcastEpisode.js +++ b/server/models/PodcastEpisode.js @@ -1,5 +1,7 @@ const { DataTypes, Model } = require('sequelize') const libraryItemsPodcastFilters = require('../utils/queries/libraryItemsPodcastFilters') +const parsePodcastDescriptionForChapters = require('../utils/parsers/parsePodcastDescriptionForChapters') +const Logger = require('../Logger') /** * @typedef ChapterObject * @property {number} id @@ -85,6 +87,17 @@ class PodcastEpisode extends Model { podcastEpisode.chapters = audioFile.chapters.map((ch) => ({ ...ch })) } else if (rssPodcastEpisode.chapters?.length) { podcastEpisode.chapters = rssPodcastEpisode.chapters.map((ch) => ({ ...ch })) + } else { + Logger.debug("[PodcastEpisode] New episode doesn't have chapters, attempting to generate them from timestamps", rssPodcastEpisode.title) + try { + podcastEpisode.chapters = parsePodcastDescriptionForChapters.parse(podcastEpisode.description, podcastEpisode.audioFile.duration) + + if (podcastEpisode.chapters.length > 0) { + Logger.info(`[PodcastEpisode] Successfully generated ${podcastEpisode.chapters.length} chapters`) + } + } catch (error) { + Logger.error(`[PodcastEpisode] createFromRssPodcastEpisode: Failed to generate chapters for "${podcastEpisode.title}"`, error) + } } return this.create(podcastEpisode) diff --git a/server/utils/parsers/parsePodcastDescriptionForChapters.js b/server/utils/parsers/parsePodcastDescriptionForChapters.js new file mode 100644 index 00000000..1fa59f83 --- /dev/null +++ b/server/utils/parsers/parsePodcastDescriptionForChapters.js @@ -0,0 +1,112 @@ +const sanitizeHtml = require('../../libs/sanitizeHtml') +const Logger = require('../../Logger') + +/** + * Parse podcast descriptions for timestamps and generate chapters + * The following formats are supports: + * + * MM:SS Chapter name + * HH:MM:SS Chapter name + * (HH:MM:SS) Chapter name + * + * 
Descriptions have to use HTML paragraph tags, HTML line-break tags, or \n to split up lines in order to be supported
+ *
+ * See test suite for more input examples
+ *
+ * @param {string} podcastDescription
+ * @param {number} audioDurationSecs
+ * @returns {ChapterObject[]}
+ */
+module.exports.parse = (podcastDescription, audioDurationSecs) => {
+ if (podcastDescription == null) {
+ throw new Error('Description must not be null')
+ }
+
+ if (audioDurationSecs == null) {
+ throw new Error('Audio duration must not be null')
+ }
+
+ // This number is arbitrary, but there have been examples where descriptions of the chapter are on the same line as the chapter title
+ // This results in an unpleasant UX where the chapter title is very long. It's also possible that an overly long chapter title is the result of a parsing failure
+ const maxChapterTitleLength = 200
+
+ const timestampRegex = /\b(\d{1,2}):(\d{1,2})(?::(\d{1,2}))?\b/
+ const chapterTitleRegex = /\b\d{1,2}:\d{1,2}(?::\d{1,2})?\b(?:\s+|\))(.+)$/
+
+ // Split on "
Introduction text paragraph 1
Introduction text paragraph 2
' + let chapters = parsePodcastDescriptionForChapters.parse(description, 1000) + + expect(chapters).to.be.empty + expect(loggerDebugStub.calledWith('No timestamps found in description, bailing out early')).to.be.true + + sinon.restore() + }) + + var testCasesTestingSuccess = [ + { + testName: 'Should handle descriptions using html paragraphs', + description: 'Introduction text paragraph 1
Introduction text paragraph 2
00:48 Chatper 1
12:14 Chapter 2
20:56 Chapter 3
27:34 Chapter 4
32:00 Chapter 5
35:16 Chapter 6
41:32 Chapter 7
46:43 Chapter 8
', + audioDuration: 3060, + expectedChapters: [ + { title: 'Chatper 1', id: 1, start: 48, end: 734 }, + { title: 'Chapter 2', id: 2, start: 734, end: 1256 }, + { title: 'Chapter 3', id: 3, start: 1256, end: 1654 }, + { title: 'Chapter 4', id: 4, start: 1654, end: 1920 }, + { title: 'Chapter 5', id: 5, start: 1920, end: 2116 }, + { title: 'Chapter 6', id: 6, start: 2116, end: 2492 }, + { title: 'Chapter 7', id: 7, start: 2492, end: 2803 }, + { title: 'Chapter 8', id: 8, start: 2803, end: 3060 } + ] + }, + { + // Example: https://podcasts.apple.com/us/podcast/giant-bombcast-931-bleepbloop-remote/id274450056?i=1000754550540 + testName: 'Should handle descriptions using html line breaks', + description: 'Introduction text paragraph 1
Introduction text paragraph 2
(00:48) Chatper 1
(12:14) Chapter 2
(20:56) Chapter 3
(27:34) Chapter 4
(32:00) Chapter 5
(35:16) Chapter 6
(41:32) Chapter 7
(46:43) Chapter 8
', + audioDuration: 3060, + expectedChapters: [ + { title: 'Chatper 1', id: 1, start: 48, end: 734 }, + { title: 'Chapter 2', id: 2, start: 734, end: 1256 }, + { title: 'Chapter 3', id: 3, start: 1256, end: 1654 }, + { title: 'Chapter 4', id: 4, start: 1654, end: 1920 }, + { title: 'Chapter 5', id: 5, start: 1920, end: 2116 }, + { title: 'Chapter 6', id: 6, start: 2116, end: 2492 }, + { title: 'Chapter 7', id: 7, start: 2492, end: 2803 }, + { title: 'Chapter 8', id: 8, start: 2803, end: 3060 } + ] + }, + { + // Example here: https://podcasts.apple.com/gb/podcast/daniel-priestley-plumbers-will-earn-more-than-lawyers/id1291423644?i=1000755513967 + testName: 'Should handle html lists and chapters with html tags in the title', + description: 'Introduction
Introduction text paragraph 1
Introduction text paragraph 2
00:48 Chatper 1
', + audioDuration: 1000, + expectedError: 'Only one chapter found, treating as invalid description' + }, + { + testName: 'Should throw if invalid minutes', + description: 'Introduction text paragraph 1
Introduction text paragraph 2
75:48 Chatper 1
', + audioDuration: 1000, + expectedError: "Timestamp contains invalid minutes or seconds field '75::48'" + }, + { + testName: 'Should throw if invalid minutes', + description: 'Introduction text paragraph 1
Introduction text paragraph 2
00:90 Chatper 1
', + audioDuration: 1000, + expectedError: "Timestamp contains invalid minutes or seconds field '0::90'" + }, + { + testName: 'Should throw if chapter goes over lenght of audio file', + description: 'Introduction text paragraph 1
Introduction text paragraph 2
00:48 Chatper 1
01:00:01 Chatper 2
', + audioDuration: 3600, + expectedError: 'Chapter found that starts after over audio duration' + }, + { + testName: 'Should throw if description is null', + description: null, + audioDuration: 1000, + expectedError: 'Description must not be null' + }, + { + testName: 'Should throw if audio duration is null', + description: '', + audioDuration: null, + expectedError: 'Audio duration must not be null' + }, + { + testName: 'Should throw if chapter has no title', + description: 'Introduction text paragraph 1
Introduction text paragraph 2
00:48 Chatper 1
00:30:00
', + audioDuration: 3600, + expectedError: 'Unable to get chapter title from description' + }, + { + // Example here: https://podcasts.apple.com/us/podcast/is-your-personal-finance-indecision-costing-you-plus/id1256091892?i=1000636624926 + testName: 'Should throw if chapter is too long', + description: '01:19 Chapter 1
10:00 Chapter 2: Lorem ipsum dolor sit amet consectetur adipiscing elit quisque faucibus ex sapien vitae pellentesque sem placerat in id cursus mi pretium tellus duis convallis tempus leo eu aenean sed diam urna tempor pulvinar vivamus fringilla>
', + audioDuration: 3600, + expectedError: 'Chapter title too long, possible parsing falure' + } + ] + testCasesTestingFailure.forEach(function (testCase) { + it(testCase.testName, () => { + expect(() => { + parsePodcastDescriptionForChapters.parse(testCase.description, testCase.audioDuration) + }).to.throw(testCase.expectedError) + }) + }) +})