audiobookshelf/server/utils/parsers/parsePodcastDescriptionForChapters.js

112 lines
3.6 KiB
JavaScript

const sanitizeHtml = require('../../libs/sanitizeHtml')
const Logger = require('../../Logger')
/**
 * Parse a podcast description for timestamps and generate chapters.
 * The following formats are supported:
 *
 * MM:SS Chapter name
 * HH:MM:SS Chapter name
 * (HH:MM:SS) Chapter name
 *
 * Descriptions have to use </p>, <br> (with or without a closing slash), </li> or a
 * newline (\n or \r\n) to split up lines in order to be supported.
 * Lines without a timestamp are skipped; the start time is taken from the first
 * timestamp found on a line.
 *
 * See test suite for more input examples
 *
 * @param {string} podcastDescription
 * @param {number} audioDurationSecs
 * @returns {ChapterObject[]} chapters shaped as `{ id, title, start, end }`, where `id` counts up
 *   from 1, `start` is in seconds, and `end` is the next chapter's start (or `audioDurationSecs`
 *   for the last chapter). An empty array is returned when no timestamp is found.
 * @throws {Error} if either argument is null/undefined, if a timestamp has minutes or seconds
 *   above 59, if a chapter starts after `audioDurationSecs`, if no title can be extracted from a
 *   timestamped line, if a title is longer than 200 characters, or if exactly one chapter is found
 */
module.exports.parse = (podcastDescription, audioDurationSecs) => {
  if (podcastDescription == null) {
    throw new Error('Description must not be null')
  }
  if (audioDurationSecs == null) {
    throw new Error('Audio duration must not be null')
  }

  // This number is arbitrary, but there have been examples where descriptions of the chapter are on the same line as the chapter title
  // This results in an unpleasant UX where the chapter is very long, it's also possible that an overly long chapter title is the result of a parsing failure
  const maxChapterTitleLength = 200
  // Captures (first):(second)[:(third)] - two groups mean mm:ss, three mean hh:mm:ss
  const timestampRegex = /\b(\d{1,2}):(\d{1,2})(?::(\d{1,2}))?\b/
  // Captures everything after a timestamp that is followed by whitespace or a closing parenthesis
  const chapterTitleRegex = /\b\d{1,2}:\d{1,2}(?::\d{1,2})?\b(?:\s+|\))(.+)$/
  // Split on "</p>", "<br>", "<br/>", "<br />", "\n", "\r\n" and "</li>".
  // The "\r" is consumed together with "\n": a trailing "\r" left on a line can stop the title
  // regex from matching, because "." never matches line terminators.
  const descriptionLineSplitRegex = /<\s*\/\s*p\s*>|<\s*br\s*\/?\s*>|\r?\n|<\s*\/\s*li\s*>/

  // Early out if there aren't any timestamps in the entire description
  if (timestampRegex.exec(podcastDescription) == null) {
    Logger.debug('No timestamps found in description, bailing out early')
    return []
  }

  const descriptionLines = podcastDescription.split(descriptionLineSplitRegex)
  const newChapters = []

  for (const descriptionLine of descriptionLines) {
    // Strip all HTML tags out
    const line = sanitizeHtml(descriptionLine, { allowedTags: [] })
    const match = timestampRegex.exec(line)
    if (match == null) continue

    const [, first, second, third] = match
    let hours = 0
    let minutes = 0
    let seconds = 0
    if (third != null) {
      // If there's three components then we can assume its hh:mm:ss
      hours = Number(first)
      minutes = Number(second)
      seconds = Number(third)
    } else {
      // otherwise assume mm:ss
      minutes = Number(first)
      seconds = Number(second)
    }

    if (minutes > 59 || seconds > 59) {
      throw new Error(`Timestamp contains invalid minutes or seconds field '${minutes}::${seconds}'`)
    }

    const startTime = seconds + minutes * 60 + hours * 60 * 60
    if (startTime > audioDurationSecs) {
      throw new Error(`Chapter found that starts after over audio duration. Duration: ${audioDurationSecs}s - Chapter start ${startTime}s`)
    }

    const chapterTitleMatch = chapterTitleRegex.exec(line)
    if (chapterTitleMatch == null || chapterTitleMatch.length < 2) {
      // Unknown chapter state
      throw new Error(`Unable to get chapter title from description, line ${line}`)
    }

    const chapterTitle = chapterTitleMatch[1].trim()
    if (chapterTitle.length > maxChapterTitleLength) {
      throw new Error(`Chapter title too long, possible parsing falure, line ${line}`)
    }

    // The previous chapter ends where this one starts
    if (newChapters.length > 0) {
      newChapters[newChapters.length - 1].end = startTime
    }
    newChapters.push({ title: chapterTitle, id: newChapters.length + 1, start: startTime })
  }

  // The last chapter runs until the end of the audio
  if (newChapters.length > 0) {
    newChapters[newChapters.length - 1].end = audioDurationSecs
  }

  // A lone timestamp is more likely an incidental time reference than a chapter list
  if (newChapters.length === 1) {
    throw new Error('Only one chapter found, treating as invalid description')
  }

  return newChapters
}