mirror of
https://github.com/advplyr/audiobookshelf.git
synced 2026-05-12 06:21:30 +00:00
Merge 95fb522e8d into 47ea6b5092
This commit is contained in:
commit
3485db8ee8
3 changed files with 291 additions and 0 deletions
|
|
@ -1,5 +1,7 @@
|
|||
const { DataTypes, Model } = require('sequelize')
|
||||
const libraryItemsPodcastFilters = require('../utils/queries/libraryItemsPodcastFilters')
|
||||
const parsePodcastDescriptionForChapters = require('../utils/parsers/parsePodcastDescriptionForChapters')
|
||||
const Logger = require('../Logger')
|
||||
/**
|
||||
* @typedef ChapterObject
|
||||
* @property {number} id
|
||||
|
|
@ -85,6 +87,17 @@ class PodcastEpisode extends Model {
|
|||
podcastEpisode.chapters = audioFile.chapters.map((ch) => ({ ...ch }))
|
||||
} else if (rssPodcastEpisode.chapters?.length) {
|
||||
podcastEpisode.chapters = rssPodcastEpisode.chapters.map((ch) => ({ ...ch }))
|
||||
} else {
|
||||
Logger.debug("[PodcastEpisode] New episode doesn't have chapters, attempting to generate them from timestamps", rssPodcastEpisode.title)
|
||||
try {
|
||||
podcastEpisode.chapters = parsePodcastDescriptionForChapters.parse(podcastEpisode.description, podcastEpisode.audioFile.duration)
|
||||
|
||||
if (podcastEpisode.chapters.length > 0) {
|
||||
Logger.info(`[PodcastEpisode] Successfully generated ${podcastEpisode.chapters.length} chapters`)
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.error(`[PodcastEpisode] createFromRssPodcastEpisode: Failed to generate chapters for "${podcastEpisode.title}"`, error)
|
||||
}
|
||||
}
|
||||
|
||||
return this.create(podcastEpisode)
|
||||
|
|
|
|||
112
server/utils/parsers/parsePodcastDescriptionForChapters.js
Normal file
112
server/utils/parsers/parsePodcastDescriptionForChapters.js
Normal file
|
|
@ -0,0 +1,112 @@
|
|||
const sanitizeHtml = require('../../libs/sanitizeHtml')
|
||||
const Logger = require('../../Logger')
|
||||
|
||||
/**
 * Parse a podcast description for timestamps and generate chapters from them.
 *
 * The following timestamp formats are supported:
 *
 *   MM:SS Chapter name
 *   HH:MM:SS Chapter name
 *   (HH:MM:SS) Chapter name
 *
 * Descriptions have to use </p>, <br>, <br />, </li> or a line break (\n, \r\n, \r)
 * to split up lines in order to be supported. Only the first timestamp found on
 * each line is used; the chapter title is the text following that timestamp.
 *
 * Each chapter ends where the next one starts; the last chapter ends at audioDurationSecs.
 *
 * See test suite for more input examples
 *
 * @param {string} podcastDescription
 * @param {number} audioDurationSecs
 * @returns {ChapterObject[]} Chapters in description order, or an empty array when the description contains no timestamps
 * @throws {Error} If either argument is null/undefined, a timestamp has minutes or seconds above 59,
 *   a chapter starts after the audio duration, a chapter starts before the previous chapter,
 *   a timestamp has no title after it, a title is longer than 200 characters, or only one chapter was found
 */
module.exports.parse = (podcastDescription, audioDurationSecs) => {
  if (podcastDescription == null) {
    throw new Error('Description must not be null')
  }

  if (audioDurationSecs == null) {
    throw new Error('Audio duration must not be null')
  }

  // This number is arbitrary, but there have been examples where descriptions of the chapter are on the same line as the chapter title
  // This results in an unpleasant UX where the chapter is very long, it's also possible that an overly long chapter title is the result of a parsing failure
  const maxChapterTitleLength = 200

  const timestampRegex = /\b(\d{1,2}):(\d{1,2})(?::(\d{1,2}))?\b/
  const chapterTitleRegex = /\b\d{1,2}:\d{1,2}(?::\d{1,2})?\b(?:\s+|\))(.+)$/

  // Split on "</p>", "<br>", "<br />", any line break (\r\n, \r, \n) and "</li>", ignoring tag case.
  // Carriage returns have to be consumed here: "." in chapterTitleRegex does not match "\r" and "$" only
  // matches at the very end of the line, so a leftover "\r" would make every title lookup fail.
  const descriptionLineSplitRegex = /<\s*\/\s*p\s*>|<\s*br\s*\/?\s*>|\r\n?|\n|<\s*\/\s*li\s*>/i

  // Early out if there aren't any timestamps in the entire description
  if (!timestampRegex.test(podcastDescription)) {
    Logger.debug('No timestamps found in description, bailing out early')
    return []
  }

  const newChapters = []

  for (const descriptionLine of podcastDescription.split(descriptionLineSplitRegex)) {
    // Strip all HTML tags out
    const line = sanitizeHtml(descriptionLine, { allowedTags: [] })

    const match = timestampRegex.exec(line)
    if (match == null) continue

    const [, first, second, third] = match

    let hours = 0
    let minutes = 0
    let seconds = 0

    if (third !== undefined) {
      // If there's three components then we can assume its hh:mm:ss
      hours = Number(first)
      minutes = Number(second)
      seconds = Number(third)
    } else {
      // otherwise assume mm:ss
      minutes = Number(first)
      seconds = Number(second)
    }

    if (minutes > 59 || seconds > 59) {
      throw new Error(`Timestamp contains invalid minutes or seconds field '${minutes}::${seconds}'`)
    }

    const startTime = seconds + minutes * 60 + hours * 60 * 60
    if (startTime > audioDurationSecs) {
      throw new Error(`Chapter found that starts after over audio duration. Duration: ${audioDurationSecs}s - Chapter start ${startTime}s`)
    }

    const chapterTitleMatch = chapterTitleRegex.exec(line)
    if (chapterTitleMatch == null) {
      // Unknown chapter state
      throw new Error(`Unable to get chapter title from description, line ${line}`)
    }

    const chapterTitle = chapterTitleMatch[1].trim()
    if (chapterTitle.length > maxChapterTitleLength) {
      throw new Error(`Chapter title too long, possible parsing falure, line ${line}`)
    }

    const previousChapter = newChapters[newChapters.length - 1]
    if (previousChapter) {
      // A timestamp that goes backwards would give the previous chapter an end before its start,
      // which most likely means the matched timestamp is not a chapter marker at all
      if (startTime < previousChapter.start) {
        throw new Error(`Chapter timestamps are not in ascending order. Previous chapter start ${previousChapter.start}s - Chapter start ${startTime}s`)
      }
      previousChapter.end = startTime
    }

    newChapters.push({ title: chapterTitle, id: newChapters.length + 1, start: startTime })
  }

  if (newChapters.length > 0) {
    // The last chapter runs until the end of the audio
    newChapters[newChapters.length - 1].end = audioDurationSecs
  }

  if (newChapters.length === 1) {
    throw new Error('Only one chapter found, treating as invalid description')
  }

  return newChapters
}
|
||||
|
|
@ -0,0 +1,166 @@
|
|||
const chai = require('chai')
|
||||
const expect = chai.expect
|
||||
const parsePodcastDescriptionForChapters = require('../../../../server/utils/parsers/parsePodcastDescriptionForChapters')
|
||||
const sinon = require('sinon')
|
||||
const Logger = require('../../../../server/Logger')
|
||||
|
||||
// Test suite for parsePodcastDescriptionForChapters.parse: table-driven success cases compare the
// returned chapters with deep equality, table-driven failure cases check the thrown error message.
describe('parsePodcastDescriptionForChapters', () => {
  // Restore stubs in a hook so a failing assertion can't leak the Logger stub into other tests
  afterEach(() => {
    sinon.restore()
  })

  it("should early out if description doens't contain timestamps", () => {
    const loggerDebugStub = sinon.stub(Logger, 'debug')
    const description = '<p>Introduction text paragraph 1</p><p>Introduction text paragraph 2</p>'
    const chapters = parsePodcastDescriptionForChapters.parse(description, 1000)

    expect(chapters).to.be.empty
    expect(loggerDebugStub.calledWith('No timestamps found in description, bailing out early')).to.be.true
  })

  // Each case: description + audio duration (seconds) in, exact chapter list out
  const testCasesTestingSuccess = [
    {
      testName: 'Should handle descriptions using html paragraphs',
      description: '<p>Introduction text paragraph 1</p><p>Introduction text paragraph 2</p><p>00:48 Chatper 1 </p><p>12:14 Chapter 2 </p><p>20:56 Chapter 3 </p><p>27:34 Chapter 4 </p><p>32:00 Chapter 5 </p><p>35:16 Chapter 6 </p><p>41:32 Chapter 7 </p><p>46:43 Chapter 8</p>',
      audioDuration: 3060,
      expectedChapters: [
        { title: 'Chatper 1', id: 1, start: 48, end: 734 },
        { title: 'Chapter 2', id: 2, start: 734, end: 1256 },
        { title: 'Chapter 3', id: 3, start: 1256, end: 1654 },
        { title: 'Chapter 4', id: 4, start: 1654, end: 1920 },
        { title: 'Chapter 5', id: 5, start: 1920, end: 2116 },
        { title: 'Chapter 6', id: 6, start: 2116, end: 2492 },
        { title: 'Chapter 7', id: 7, start: 2492, end: 2803 },
        { title: 'Chapter 8', id: 8, start: 2803, end: 3060 }
      ]
    },
    {
      // Example: https://podcasts.apple.com/us/podcast/giant-bombcast-931-bleepbloop-remote/id274450056?i=1000754550540
      testName: 'Should handle descriptions using html line breaks',
      description: '<br>Introduction text paragraph 1<br /><br>Introduction text paragraph 2<br /><br />0:00:00 Chapter 1<br />0:17:05 Chapter 2<br />0:33:58 Chapter 3<br />0:40:35 Chapter 4<br />Unrelated outro line<br />',
      audioDuration: 2700,
      expectedChapters: [
        { title: 'Chapter 1', id: 1, start: 0, end: 1025 },
        { title: 'Chapter 2', id: 2, start: 1025, end: 2038 },
        { title: 'Chapter 3', id: 3, start: 2038, end: 2435 },
        { title: 'Chapter 4', id: 4, start: 2435, end: 2700 }
      ]
    },
    {
      // Example: https://podcasts.apple.com/us/podcast/xboxs-big-helix-reveal-witcher-4-path-tracing-crimson/id1596728253?i=1000755411491
      testName: 'Should handle descriptions using unix new lines',
      description: ['Introduction text paragraph 1', 'Introduction text paragraph 2', '0:00:00 Chapter 1', '0:17:05 Chapter 2', '0:33:58 Chapter 3', '0:40:35 Chapter 4', 'Unrelated outro line'].join('\n'),
      audioDuration: 2700,
      expectedChapters: [
        { title: 'Chapter 1', id: 1, start: 0, end: 1025 },
        { title: 'Chapter 2', id: 2, start: 1025, end: 2038 },
        { title: 'Chapter 3', id: 3, start: 2038, end: 2435 },
        { title: 'Chapter 4', id: 4, start: 2435, end: 2700 }
      ]
    },
    {
      testName: 'Should handle descriptions with no timestamps',
      description: 'Lorem ipsum dolor sit amet consectetur adipiscing elit quisque faucibus ex sapien vitae pellentesque sem placerat in id cursus mi pretium tellus duis convallis tempus leo eu aenean sed diam urna tempor pulvinar vivamus fringilla lacus nec metus bibendum egestas.',
      audioDuration: 2700,
      expectedChapters: []
    },
    {
      testName: 'Should handle timestampes in parentheses',
      description: '<p>Introduction text paragraph 1</p><p>Introduction text paragraph 2</p><p>(00:48) Chatper 1 </p><p>(12:14) Chapter 2 </p><p>(20:56) Chapter 3 </p><p>(27:34) Chapter 4 </p><p>(32:00) Chapter 5 </p><p>(35:16) Chapter 6 </p><p>(41:32) Chapter 7 </p><p>(46:43) Chapter 8</p>',
      audioDuration: 3060,
      expectedChapters: [
        { title: 'Chatper 1', id: 1, start: 48, end: 734 },
        { title: 'Chapter 2', id: 2, start: 734, end: 1256 },
        { title: 'Chapter 3', id: 3, start: 1256, end: 1654 },
        { title: 'Chapter 4', id: 4, start: 1654, end: 1920 },
        { title: 'Chapter 5', id: 5, start: 1920, end: 2116 },
        { title: 'Chapter 6', id: 6, start: 2116, end: 2492 },
        { title: 'Chapter 7', id: 7, start: 2492, end: 2803 },
        { title: 'Chapter 8', id: 8, start: 2803, end: 3060 }
      ]
    },
    {
      // Example here: https://podcasts.apple.com/gb/podcast/daniel-priestley-plumbers-will-earn-more-than-lawyers/id1291423644?i=1000755513967
      testName: 'Should handle html lists and chapters with html tags in the title',
      description: '<p>Introduction</p><p><br /></p><p><br /></p>Chapters<ul><li><strong>00:00:00</strong> Intro</li><li><strong>00:03:55</strong> Chapter 1</li><li><strong>00:09:52</strong> Chapter 2 </li><li><strong>00:16:11</strong> Chapter 3</li><li><strong>00:20:03</strong> Chapter 4</li><li><strong>00:24:08</strong> Chapter 5</li>',
      audioDuration: 4000,
      expectedChapters: [
        { title: 'Intro', id: 1, start: 0, end: 235 },
        { title: 'Chapter 1', id: 2, start: 235, end: 592 },
        { title: 'Chapter 2', id: 3, start: 592, end: 971 },
        { title: 'Chapter 3', id: 4, start: 971, end: 1203 },
        { title: 'Chapter 4', id: 5, start: 1203, end: 1448 },
        { title: 'Chapter 5', id: 6, start: 1448, end: 4000 }
      ]
    }
  ]
  testCasesTestingSuccess.forEach((testCase) => {
    it(testCase.testName, () => {
      const chapters = parsePodcastDescriptionForChapters.parse(testCase.description, testCase.audioDuration)
      expect(chapters).to.be.deep.equal(testCase.expectedChapters)
    })
  })

  // Each case: input that must make parse() throw an error whose message contains expectedError
  const testCasesTestingFailure = [
    {
      testName: 'Should throw if only one chapter found',
      description: '<p>Introduction text paragraph 1</p><p>Introduction text paragraph 2</p><p>00:48 Chatper 1 </p>',
      audioDuration: 1000,
      expectedError: 'Only one chapter found, treating as invalid description'
    },
    {
      testName: 'Should throw if invalid minutes',
      description: '<p>Introduction text paragraph 1</p><p>Introduction text paragraph 2</p><p>75:48 Chatper 1 </p>',
      audioDuration: 1000,
      expectedError: "Timestamp contains invalid minutes or seconds field '75::48'"
    },
    {
      // Named separately from the minutes case so a failure report identifies which field broke
      testName: 'Should throw if invalid seconds',
      description: '<p>Introduction text paragraph 1</p><p>Introduction text paragraph 2</p><p>00:90 Chatper 1 </p>',
      audioDuration: 1000,
      expectedError: "Timestamp contains invalid minutes or seconds field '0::90'"
    },
    {
      testName: 'Should throw if chapter goes over lenght of audio file',
      description: '<p>Introduction text paragraph 1</p><p>Introduction text paragraph 2</p><p>00:48 Chatper 1 </p><p>01:00:01 Chatper 2 </p>',
      audioDuration: 3600,
      expectedError: 'Chapter found that starts after over audio duration'
    },
    {
      testName: 'Should throw if description is null',
      description: null,
      audioDuration: 1000,
      expectedError: 'Description must not be null'
    },
    {
      testName: 'Should throw if audio duration is null',
      description: '',
      audioDuration: null,
      expectedError: 'Audio duration must not be null'
    },
    {
      testName: 'Should throw if chapter has no title',
      description: '<p>Introduction text paragraph 1</p><p>Introduction text paragraph 2</p><p>00:48 Chatper 1 </p><p>00:30:00</p>',
      audioDuration: 3600,
      expectedError: 'Unable to get chapter title from description'
    },
    {
      // Example here: https://podcasts.apple.com/us/podcast/is-your-personal-finance-indecision-costing-you-plus/id1256091892?i=1000636624926
      testName: 'Should throw if chapter is too long',
      description: '<p>01:19 Chapter 1</p><p>10:00 Chapter 2: Lorem ipsum dolor sit amet consectetur adipiscing elit quisque faucibus ex sapien vitae pellentesque sem placerat in id cursus mi pretium tellus duis convallis tempus leo eu aenean sed diam urna tempor pulvinar vivamus fringilla></p>',
      audioDuration: 3600,
      expectedError: 'Chapter title too long, possible parsing falure'
    }
  ]
  testCasesTestingFailure.forEach((testCase) => {
    it(testCase.testName, () => {
      expect(() => {
        parsePodcastDescriptionForChapters.parse(testCase.description, testCase.audioDuration)
      }).to.throw(testCase.expectedError)
    })
  })
})
|
||||
Loading…
Add table
Add a link
Reference in a new issue