const sanitizeHtml = require('../../libs/sanitizeHtml')
const Logger = require('../../Logger')

/**
 * Parse a podcast description for timestamps and generate chapters from them.
 *
 * The following line formats are supported:
 *
 *   MM:SS Chapter name
 *   HH:MM:SS Chapter name
 *   (HH:MM:SS) Chapter name
 *
 * The description has to separate lines with a closing paragraph tag, a line break tag
 * (<br>, <br/> or <br />), a closing list item tag or a newline (\n or \r\n) in order
 * to be supported. Tag names are matched case-insensitively. Lines that do not contain
 * a timestamp are ignored.
 *
 * Chapter ids are 1-based in order of appearance. Each chapter ends where the next one
 * starts and the last chapter ends at audioDurationSecs.
 *
 * See test suite for more input examples
 *
 * @param {string} podcastDescription - description text, may contain HTML
 * @param {number} audioDurationSecs - duration of the audio in seconds
 * @returns {ChapterObject[]} chapters shaped as { title, id, start, end }, or an empty array when no timestamps are found
 * @throws {Error} when an argument is null or undefined, when a timestamp has a minutes or seconds
 *   field above 59, when a chapter starts after the audio duration, when no title can be read from a
 *   timestamped line, when a title is longer than 200 characters, or when only one chapter is found
 */
module.exports.parse = (podcastDescription, audioDurationSecs) => {
  if (podcastDescription == null) {
    throw new Error('Description must not be null')
  }
  if (audioDurationSecs == null) {
    throw new Error('Audio duration must not be null')
  }

  // This number is arbitrary, but there have been examples where descriptions of the chapter are on the same line as the chapter title
  // This results in an unpleasant UX where the chapter is very long, it's also possible that an overly long chapter title is the result of a parsing failure
  const maxChapterTitleLength = 200

  // First MM:SS or HH:MM:SS timestamp on a line, the third group is only set for HH:MM:SS
  const timestampRegex = /\b(\d{1,2}):(\d{1,2})(?::(\d{1,2}))?\b/
  // Everything after a timestamp (followed by whitespace or a closing parenthesis) is the chapter title
  const chapterTitleRegex = /\b\d{1,2}:\d{1,2}(?::\d{1,2})?\b(?:\s+|\))(.+)$/
  // Split on "</p>", "<br>" / "<br/>" / "<br />", "\n" / "\r\n" and "</li>", ignoring tag case.
  // The "\r" of a CRLF line ending is consumed here on purpose: "." in chapterTitleRegex does not match "\r",
  // so a "\r" left at the end of a line (assuming sanitizeHtml keeps it) would make the title lookup fail.
  const descriptionLineSplitRegex = /<\s*\/\s*p\s*>|<\s*br\s*\/?\s*>|\r?\n|<\s*\/\s*li\s*>/i

  // Early out if there aren't any timestamps in the entire description
  if (!timestampRegex.test(podcastDescription)) {
    Logger.debug('No timestamps found in description, bailing out early')
    return []
  }

  const newChapters = []
  for (const descriptionLine of podcastDescription.split(descriptionLineSplitRegex)) {
    // Strip all HTML tags out
    const line = sanitizeHtml(descriptionLine, { allowedTags: [] })

    const timestampMatch = timestampRegex.exec(line)
    if (timestampMatch == null) continue

    // Three components are read as hh:mm:ss, two components as mm:ss
    const [, first, second, third] = timestampMatch
    const hasHours = third !== undefined
    const hours = hasHours ? Number(first) : 0
    const minutes = Number(hasHours ? second : first)
    const seconds = Number(hasHours ? third : second)

    if (minutes > 59 || seconds > 59) {
      throw new Error(`Timestamp contains invalid minutes or seconds field '${minutes}::${seconds}'`)
    }

    const startTime = seconds + minutes * 60 + hours * 60 * 60
    if (startTime > audioDurationSecs) {
      throw new Error(`Chapter found that starts after over audio duration. \nDuration: ${audioDurationSecs}s - Chapter start ${startTime}s`)
    }

    const chapterTitleMatch = chapterTitleRegex.exec(line)
    if (chapterTitleMatch == null) {
      // Unknown chapter state
      throw new Error(`Unable to get chapter title from description, line ${line}`)
    }

    const chapterTitle = chapterTitleMatch[1].trim()
    if (chapterTitle.length > maxChapterTitleLength) {
      throw new Error(`Chapter title too long, possible parsing falure, line ${line}`)
    }

    // The previous chapter ends where this one starts
    if (newChapters.length > 0) {
      newChapters[newChapters.length - 1].end = startTime
    }

    newChapters.push({
      title: chapterTitle,
      id: newChapters.length + 1,
      start: startTime
    })
  }

  if (newChapters.length === 1) {
    throw new Error('Only one chapter found, treating as invalid description')
  }

  // The last chapter runs until the end of the audio
  if (newChapters.length > 0) {
    newChapters[newChapters.length - 1].end = audioDurationSecs
  }

  return newChapters
}