2026-03-16 20:59:00 +00:00
const sanitizeHtml = require ( '../../libs/sanitizeHtml' )
2026-03-16 18:42:01 +00:00
const Logger = require ( '../../Logger' )
/**
 * Parse a podcast description for timestamps and generate chapters from them.
 *
 * The following timestamp formats are supported:
 *
 *   MM:SS Chapter name
 *   HH:MM:SS Chapter name
 *   (HH:MM:SS) Chapter name
 *
 * The description has to separate its lines with "</p>", "<br>" (with or without
 * a self-closing slash), "</li>" or "\n" in order to be supported. Only the first
 * timestamp found on each line is used; lines without a timestamp are skipped.
 *
 * Each generated chapter has the shape { title, id, start, end } where id is the
 * 1-based position of the chapter, start is in seconds, and end is the start of
 * the following chapter (or audioDurationSecs for the last chapter).
 *
 * See test suite for more input examples
 *
 * @param {string} podcastDescription
 * @param {number} audioDurationSecs
 * @returns {ChapterObject[]} the parsed chapters, or an empty array when no line contains a timestamp
 * @throws {Error} when either argument is null/undefined, when a timestamp has minutes
 *   or seconds above 59, when a chapter starts after the audio duration, when no title
 *   follows a timestamp, when a title is too long, or when exactly one chapter is found
 */
module.exports.parse = (podcastDescription, audioDurationSecs) => {
  if (podcastDescription == null) {
    throw new Error('Description must not be null')
  }
  if (audioDurationSecs == null) {
    throw new Error('Audio duration must not be null')
  }

  // This number is arbitrary, but there have been examples where the description of the chapter is on the same line as the chapter title.
  // This results in an unpleasant UX where the chapter title is very long; an overly long title may also be the result of a parsing failure.
  const maxChapterTitleLength = 200

  // Groups: 1 and 2 are always present on a match, 3 only for the three-component form
  const timestampRegex = /\b(\d{1,2}):(\d{1,2})(?::(\d{1,2}))?\b/
  // Group 1: everything after the timestamp (and an optional closing parenthesis) up to the end of the line
  const chapterTitleRegex = /\b\d{1,2}:\d{1,2}(?::\d{1,2})?\b(?:\s+|\))(.+)$/

  // Split on "</p>", "<br>", "<br/>", "<br />", "\n" and "</li>".
  // The slash in <br> is optional: otherwise a plain "<br>" is not treated as a line break,
  // and once the tags are stripped below, neighbouring chapter lines would be fused into one.
  const descriptionLineSplitRegex = /<\s*\/\s*p\s*>|<\s*br\s*\/?\s*>|\n|<\s*\/\s*li\s*>/

  const descriptionLines = podcastDescription.split(descriptionLineSplitRegex)
  const newChapters = []

  for (const rawLine of descriptionLines) {
    // Strip all HTML tags out
    const line = sanitizeHtml(rawLine, { allowedTags: [] })

    const match = timestampRegex.exec(line)
    if (match == null) continue

    const [, first, second, third] = match
    let hours = 0
    let minutes = 0
    let seconds = 0
    if (third !== undefined) {
      // If there are three components then we can assume it's hh:mm:ss
      hours = Number(first)
      minutes = Number(second)
      seconds = Number(third)
    } else {
      // otherwise assume mm:ss
      minutes = Number(first)
      seconds = Number(second)
    }

    if (minutes > 59 || seconds > 59) {
      throw new Error(` Timestamp contains invalid minutes or seconds field ' ${minutes} :: ${seconds} ' `)
    }

    const startTime = seconds + minutes * 60 + hours * 60 * 60
    if (startTime > audioDurationSecs) {
      throw new Error(` Chapter found that starts after over audio duration. Duration: ${audioDurationSecs} s - Chapter start ${startTime} s `)
    }

    const chapterTitleMatch = chapterTitleRegex.exec(line)
    if (chapterTitleMatch == null || chapterTitleMatch.length < 2) {
      // Unknown chapter state
      throw new Error(` Unable to get chapter title from description, line ${line} `)
    }

    const chapterTitle = chapterTitleMatch[1].trim()
    if (chapterTitle.length > maxChapterTitleLength) {
      throw new Error(` Chapter title too long, possible parsing falure, line ${line} `)
    }

    // The previous chapter ends where this one starts
    if (newChapters.length > 0) {
      newChapters[newChapters.length - 1].end = startTime
    }
    newChapters.push({ title: chapterTitle, id: newChapters.length + 1, start: startTime })
  }

  // The last chapter runs until the end of the audio
  if (newChapters.length > 0) {
    newChapters[newChapters.length - 1].end = audioDurationSecs
  }

  // Validate before logging so that a success message is never followed by a throw
  if (newChapters.length === 1) {
    throw new Error('Only one chapter found, treating as invalid description')
  }

  Logger.info(` Successfully generated ${newChapters.length} chapters `)
  return newChapters
}