diff --git a/server/models/PodcastEpisode.js b/server/models/PodcastEpisode.js index 996f55f5..fdef2c50 100644 --- a/server/models/PodcastEpisode.js +++ b/server/models/PodcastEpisode.js @@ -1,5 +1,7 @@ const { DataTypes, Model } = require('sequelize') const libraryItemsPodcastFilters = require('../utils/queries/libraryItemsPodcastFilters') +const parsePodcastDescriptionForChapters = require('../utils/parsers/parsePodcastDescriptionForChapters') +const Logger = require('../Logger') /** * @typedef ChapterObject * @property {number} id @@ -85,6 +87,17 @@ class PodcastEpisode extends Model { podcastEpisode.chapters = audioFile.chapters.map((ch) => ({ ...ch })) } else if (rssPodcastEpisode.chapters?.length) { podcastEpisode.chapters = rssPodcastEpisode.chapters.map((ch) => ({ ...ch })) + } else { + Logger.debug("[PodcastEpisode] New episode doesn't have chapters, attempting to generate them from timestamps", rssPodcastEpisode.title) + try { + podcastEpisode.chapters = parsePodcastDescriptionForChapters.parse(podcastEpisode.description, podcastEpisode.audioFile.duration) + + if (podcastEpisode.chapters.length > 0) { + Logger.info(`[PodcastEpisode] Successfully generated ${podcastEpisode.chapters.length} chapters`) + } + } catch (error) { + Logger.error(`[PodcastEpisode] createFromRssPodcastEpisode: Failed to generate chapters for "${podcastEpisode.title}"`, error) + } } return this.create(podcastEpisode) diff --git a/server/utils/parsers/parsePodcastDescriptionForChapters.js b/server/utils/parsers/parsePodcastDescriptionForChapters.js new file mode 100644 index 00000000..1fa59f83 --- /dev/null +++ b/server/utils/parsers/parsePodcastDescriptionForChapters.js @@ -0,0 +1,112 @@ +const sanitizeHtml = require('../../libs/sanitizeHtml') +const Logger = require('../../Logger') + +/** + * Parse podcast descriptions for timestamps and generate chapters + * The following formats are supports: + * + * MM:SS Chapter name + * HH:MM:SS Chapter name + * (HH:MM:SS) Chapter name + * + * 
/**
 * Parse a podcast episode description for timestamps and generate chapters from them.
 *
 * Supported timestamp formats (one chapter per description line):
 *
 *   MM:SS Chapter name
 *   HH:MM:SS Chapter name
 *   (HH:MM:SS) Chapter name
 *
 * The description is broken into lines on a closing paragraph tag, a line break tag
 * (with or without a self-closing slash), a closing list item tag (all matched
 * case-insensitively) and on "\n", "\r\n" or "\r". Any remaining HTML is stripped from
 * each line before it is inspected. Lines that contain no timestamp are ignored.
 *
 * Each chapter ends where the next one starts; the last chapter ends at the audio
 * duration. Timestamps are taken in document order and their ordering is not validated.
 *
 * See test suite for more input examples
 *
 * @param {string} podcastDescription
 * @param {number} audioDurationSecs
 * @returns {ChapterObject[]} chapters with `id`, `title`, `start` and `end`, or an empty array when no timestamps are found
 * @throws {Error} when either argument is null/undefined, when a timestamp has minutes or seconds above 59,
 *   when a chapter starts after the audio duration, when no title follows a timestamp, when a title is
 *   longer than 200 characters, or when exactly one chapter is found
 */
module.exports.parse = (podcastDescription, audioDurationSecs) => {
  if (podcastDescription == null) {
    throw new Error('Description must not be null')
  }

  if (audioDurationSecs == null) {
    throw new Error('Audio duration must not be null')
  }

  // This number is arbitrary, but there have been examples where descriptions of the chapter are on the same line as the chapter title
  // This results in an unpleasant UX where the chapter is very long, it's also possible that an overly long chapter title is the result of a parsing failure
  const maxChapterTitleLength = 200

  const timestampRegex = /\b(\d{1,2}):(\d{1,2})(?::(\d{1,2}))?\b/
  const chapterTitleRegex = /\b\d{1,2}:\d{1,2}(?::\d{1,2})?\b(?:\s+|\))(.+)$/

  // Line separators: closing paragraph tag, line break tag (slash optional), closing list item tag
  // and any newline flavour. Carriage returns must be consumed here: "." in chapterTitleRegex does
  // not match "\r", so a leftover "\r" would make every title lookup fail on CRLF descriptions.
  const descriptionLineSplitRegex = /<\s*\/\s*p\s*>|<\s*br\s*\/?\s*>|<\s*\/\s*li\s*>|\r\n?|\n/i

  // Early out if there aren't any timestamps in the entire description
  if (!timestampRegex.test(podcastDescription)) {
    Logger.debug('No timestamps found in description, bailing out early')
    return []
  }

  const newChapters = []

  for (const rawLine of podcastDescription.split(descriptionLineSplitRegex)) {
    // Strip all HTML tags out
    const line = sanitizeHtml(rawLine, { allowedTags: [] })

    const match = timestampRegex.exec(line)
    if (match == null) continue

    const [, first, second, third] = match

    let hours = 0
    let minutes = 0
    let seconds = 0

    if (third !== undefined) {
      // Three components: hh:mm:ss
      hours = Number(first)
      minutes = Number(second)
      seconds = Number(third)
    } else {
      // Two components: mm:ss
      minutes = Number(first)
      seconds = Number(second)
    }

    if (minutes > 59 || seconds > 59) {
      throw new Error(`Timestamp contains invalid minutes or seconds field '${minutes}::${seconds}'`)
    }

    const startTime = seconds + minutes * 60 + hours * 60 * 60
    if (startTime > audioDurationSecs) {
      throw new Error(`Chapter found that starts after over audio duration. Duration: ${audioDurationSecs}s - Chapter start ${startTime}s`)
    }

    const chapterTitleMatch = chapterTitleRegex.exec(line)
    if (chapterTitleMatch == null || chapterTitleMatch.length < 2) {
      // Unknown chapter state
      throw new Error(`Unable to get chapter title from description, line ${line}`)
    }

    const chapterTitle = chapterTitleMatch[1].trim()
    if (chapterTitle.length > maxChapterTitleLength) {
      throw new Error(`Chapter title too long, possible parsing falure, line ${line}`)
    }

    // The previous chapter runs until this one begins
    if (newChapters.length > 0) {
      newChapters[newChapters.length - 1].end = startTime
    }

    newChapters.push({ title: chapterTitle, id: newChapters.length + 1, start: startTime })
  }

  // The final chapter runs to the end of the audio
  if (newChapters.length > 0) {
    newChapters[newChapters.length - 1].end = audioDurationSecs
  }

  // A lone timestamp is more likely an incidental time mention than a chapter list
  if (newChapters.length === 1) {
    throw new Error('Only one chapter found, treating as invalid description')
  }

  return newChapters
}

Introduction text paragraph 1

Introduction text paragraph 2

' + let chapters = parsePodcastDescriptionForChapters.parse(description, 1000) + + expect(chapters).to.be.empty + expect(loggerDebugStub.calledWith('No timestamps found in description, bailing out early')).to.be.true + + sinon.restore() + }) + + var testCasesTestingSuccess = [ + { + testName: 'Should handle descriptions using html paragraphs', + description: '

Introduction text paragraph 1

Introduction text paragraph 2

00:48 Chatper 1

12:14 Chapter 2

20:56 Chapter 3

27:34 Chapter 4

32:00 Chapter 5

35:16 Chapter 6

41:32 Chapter 7

46:43 Chapter 8

', + audioDuration: 3060, + expectedChapters: [ + { title: 'Chatper 1', id: 1, start: 48, end: 734 }, + { title: 'Chapter 2', id: 2, start: 734, end: 1256 }, + { title: 'Chapter 3', id: 3, start: 1256, end: 1654 }, + { title: 'Chapter 4', id: 4, start: 1654, end: 1920 }, + { title: 'Chapter 5', id: 5, start: 1920, end: 2116 }, + { title: 'Chapter 6', id: 6, start: 2116, end: 2492 }, + { title: 'Chapter 7', id: 7, start: 2492, end: 2803 }, + { title: 'Chapter 8', id: 8, start: 2803, end: 3060 } + ] + }, + { + // Example: https://podcasts.apple.com/us/podcast/giant-bombcast-931-bleepbloop-remote/id274450056?i=1000754550540 + testName: 'Should handle descriptions using html line breaks', + description: '
Introduction text paragraph 1

Introduction text paragraph 2

0:00:00 Chapter 1
0:17:05 Chapter 2
0:33:58 Chapter 3
0:40:35 Chapter 4
Unrelated outro line
', + audioDuration: 2700, + expectedChapters: [ + { title: 'Chapter 1', id: 1, start: 0, end: 1025 }, + { title: 'Chapter 2', id: 2, start: 1025, end: 2038 }, + { title: 'Chapter 3', id: 3, start: 2038, end: 2435 }, + { title: 'Chapter 4', id: 4, start: 2435, end: 2700 } + ] + }, + { + // Example: https://podcasts.apple.com/us/podcast/xboxs-big-helix-reveal-witcher-4-path-tracing-crimson/id1596728253?i=1000755411491 + testName: 'Should handle descriptions using unix new lines', + description: `Introduction text paragraph 1 + Introduction text paragraph 2 + 0:00:00 Chapter 1 + 0:17:05 Chapter 2 + 0:33:58 Chapter 3 + 0:40:35 Chapter 4 + Unrelated outro line`, + audioDuration: 2700, + expectedChapters: [ + { title: 'Chapter 1', id: 1, start: 0, end: 1025 }, + { title: 'Chapter 2', id: 2, start: 1025, end: 2038 }, + { title: 'Chapter 3', id: 3, start: 2038, end: 2435 }, + { title: 'Chapter 4', id: 4, start: 2435, end: 2700 } + ] + }, + { + testName: 'Should handle descriptions with no timestamps', + description: 'Lorem ipsum dolor sit amet consectetur adipiscing elit quisque faucibus ex sapien vitae pellentesque sem placerat in id cursus mi pretium tellus duis convallis tempus leo eu aenean sed diam urna tempor pulvinar vivamus fringilla lacus nec metus bibendum egestas.', + audioDuration: 2700, + expectedChapters: [] + }, + { + testName: 'Should handle timestampes in parentheses', + description: '

Introduction text paragraph 1

Introduction text paragraph 2

(00:48) Chatper 1

(12:14) Chapter 2

(20:56) Chapter 3

(27:34) Chapter 4

(32:00) Chapter 5

(35:16) Chapter 6

(41:32) Chapter 7

(46:43) Chapter 8

', + audioDuration: 3060, + expectedChapters: [ + { title: 'Chatper 1', id: 1, start: 48, end: 734 }, + { title: 'Chapter 2', id: 2, start: 734, end: 1256 }, + { title: 'Chapter 3', id: 3, start: 1256, end: 1654 }, + { title: 'Chapter 4', id: 4, start: 1654, end: 1920 }, + { title: 'Chapter 5', id: 5, start: 1920, end: 2116 }, + { title: 'Chapter 6', id: 6, start: 2116, end: 2492 }, + { title: 'Chapter 7', id: 7, start: 2492, end: 2803 }, + { title: 'Chapter 8', id: 8, start: 2803, end: 3060 } + ] + }, + { + // Example here: https://podcasts.apple.com/gb/podcast/daniel-priestley-plumbers-will-earn-more-than-lawyers/id1291423644?i=1000755513967 + testName: 'Should handle html lists and chapters with html tags in the title', + description: '

Introduction



Chapters