diff --git a/client/components/modals/libraries/EditModal.vue b/client/components/modals/libraries/EditModal.vue index a6d8a4d59..3d462164c 100644 --- a/client/components/modals/libraries/EditModal.vue +++ b/client/components/modals/libraries/EditModal.vue @@ -127,7 +127,7 @@ export default { autoScanCronExpression: null, hideSingleBookSeries: false, onlyShowLaterBooksInContinueSeries: false, - metadataPrecedence: ['folderStructure', 'audioMetatags', 'nfoFile', 'txtFiles', 'opfFile', 'absMetadata'], + metadataPrecedence: ['folderStructure', 'audioMetatags', 'nfoFile', 'txtFiles', 'opfFile', 'daisyFile', 'absMetadata'], markAsFinishedPercentComplete: null, markAsFinishedTimeRemaining: 10 } diff --git a/client/components/modals/libraries/LibraryScannerSettings.vue b/client/components/modals/libraries/LibraryScannerSettings.vue index b27925ce4..07f1d15ec 100644 --- a/client/components/modals/libraries/LibraryScannerSettings.vue +++ b/client/components/modals/libraries/LibraryScannerSettings.vue @@ -81,6 +81,11 @@ export default { name: 'OPF file', include: true }, + daisyFile: { + id: 'daisyFile', + name: 'DAISY ncc.html file', + include: true + }, absMetadata: { id: 'absMetadata', name: 'Audiobookshelf metadata file', @@ -157,4 +162,4 @@ export default { this.init() } } - \ No newline at end of file + diff --git a/server/models/Library.js b/server/models/Library.js index 708880aad..72473b466 100644 --- a/server/models/Library.js +++ b/server/models/Library.js @@ -82,7 +82,7 @@ class Library extends Model { } static get defaultMetadataPrecedence() { - return ['folderStructure', 'audioMetatags', 'nfoFile', 'txtFiles', 'opfFile', 'absMetadata'] + return ['folderStructure', 'audioMetatags', 'nfoFile', 'txtFiles', 'opfFile', 'daisyFile', 'absMetadata'] } /** diff --git a/server/scanner/BookScanner.js b/server/scanner/BookScanner.js index a1e7ff507..a10c9565e 100644 --- a/server/scanner/BookScanner.js +++ b/server/scanner/BookScanner.js @@ -23,6 +23,7 @@ const CoverManager 
/**
 * Metadata keys whose DAISY value is a list. A list only replaces the value
 * already present on bookMetadata when it is non-empty.
 */
const DAISY_LIST_KEYS = ['tags', 'genres', 'authors', 'narrators']

/**
 * Apply DAISY chapter titles to existing chapters by position.
 * Chapters without a matching (non-empty) DAISY title are returned untouched;
 * all other chapter fields (e.g. timings) are preserved.
 *
 * NOTE(review): titles are matched purely by index, so a differing number of
 * DAISY headings and existing chapters will shift titles — confirm this is intended.
 *
 * @param {Object[]} existingChapters
 * @param {{title:string}[]} daisyChapters
 * @returns {Object[]} new array; changed chapters are shallow copies
 */
function applyDaisyChapterTitles(existingChapters, daisyChapters) {
  return existingChapters.map((chapter, index) => {
    const title = daisyChapters[index]?.title
    if (!title) return chapter
    return {
      ...chapter,
      id: chapter.id ?? index,
      title
    }
  })
}

class DaisyFileScanner {
  /**
   * Parse metadata from DAISY ncc.html file found in library scan and update bookMetadata
   *
   * Scalar values overwrite bookMetadata when truthy, list values (tags, genres,
   * authors, narrators) when non-empty. Chapter titles are applied onto existing
   * chapters; when bookMetadata has no chapters they are built from audioFiles.
   *
   * @param {import('../models/LibraryItem').LibraryFileObject} daisyLibraryFileObj
   * @param {Object} bookMetadata mutated in place
   * @param {import('../models/Book').AudioFileObject[]} [audioFiles] ordered audio files used to build chapter timings
   * @returns {Promise<void>}
   */
  async scanBookDaisyFile(daisyLibraryFileObj, bookMetadata, audioFiles = []) {
    const htmlText = await readTextFile(daisyLibraryFileObj.metadata.path)
    const daisyMetadata = htmlText ? parseDaisyMetadata(htmlText) : null
    if (!daisyMetadata) return

    for (const [key, value] of Object.entries(daisyMetadata)) {
      if (key === 'chapters') {
        if (!value?.length) continue

        // DAISY ncc.html provides chapter names; preserve existing timings if available.
        if (bookMetadata.chapters?.length) {
          bookMetadata.chapters = applyDaisyChapterTitles(bookMetadata.chapters, value)
        } else {
          const chaptersFromFiles = this.buildChaptersFromAudioFiles(audioFiles, value)
          if (chaptersFromFiles.length) {
            bookMetadata.chapters = chaptersFromFiles
          }
        }
      } else if (DAISY_LIST_KEYS.includes(key)) {
        if (value?.length) {
          bookMetadata[key] = value
        }
      } else if (value) {
        bookMetadata[key] = value
      }
    }
  }

  /**
   * Build chapter timings from ordered audio files while applying DAISY chapter titles.
   * One chapter is produced per audio file that has a duration; files without a
   * duration are skipped and do not consume a DAISY title.
   * Falls back to the file basename (or "Chapter N" when the file has no filename)
   * if DAISY has fewer titles than files.
   *
   * @param {import('../models/Book').AudioFileObject[]} audioFiles
   * @param {{title:string}[]} [daisyChapters] may be empty, null or undefined
   * @returns {import('../models/Book').ChapterObject[]}
   */
  buildChaptersFromAudioFiles(audioFiles, daisyChapters = []) {
    if (!audioFiles?.length) return []

    const chapters = []
    let currentStartTime = 0

    for (const audioFile of audioFiles) {
      if (!audioFile.duration) continue

      // Chapter ids are sequential over the files that are actually used
      const chapterId = chapters.length
      const filename = audioFile.metadata?.filename
      const fallbackTitle = filename ? Path.basename(filename, Path.extname(filename)) : `Chapter ${chapterId + 1}`
      const end = currentStartTime + audioFile.duration

      chapters.push({
        id: chapterId,
        start: currentStartTime,
        end,
        // Optional chaining on the list itself so a null/undefined list falls back instead of throwing
        title: daisyChapters?.[chapterId]?.title || fallbackTitle
      })

      currentStartTime = end
    }

    return chapters
  }
}
/**
 * Candidate ISBN: either a 978/979-prefixed run or a shorter run of digits,
 * hyphens and spaces, ending in a digit or X and delimited by non-alphanumerics.
 */
const ISBN_PATTERN = /(?:^|[^a-z0-9])(97[89][\d\- ]{9,16}[\dx]|[\d\- ]{9,14}[\dx])(?:$|[^a-z0-9])/i

/**
 * Candidate ASIN: exactly 10 alphanumeric characters delimited by non-alphanumerics.
 * NOTE(review): this accepts any 10-character alphanumeric token, including plain
 * words and ISBN-10 digits — confirm whether the untyped fallback should be stricter.
 */
const ASIN_PATTERN = /(?:^|[^a-z0-9])([a-z0-9]{10})(?:$|[^a-z0-9])/i

/**
 * Get all non-empty values collected for a meta tag name.
 *
 * @param {Object<string, string[]>} metaTags
 * @param {string} tagName lower-cased meta name
 * @returns {string[]} empty array when the tag was not seen
 */
function getValues(metaTags, tagName) {
  return metaTags[tagName]?.filter((v) => v) || []
}

/**
 * Get the first value of the first tag name that has any values.
 *
 * @param {Object<string, string[]>} metaTags
 * @param {string[]} tagNames tag names in priority order
 * @returns {string|null}
 */
function getFirstValue(metaTags, tagNames) {
  for (const tagName of tagNames) {
    const values = getValues(metaTags, tagName)
    if (values.length) return values[0]
  }
  return null
}

/**
 * Expand raw name strings into a de-duplicated list of names (first occurrence wins).
 * Uses parseNameString and falls back to splitting on ";" when it yields no names.
 *
 * @param {string[]} values
 * @returns {string[]}
 */
function parseNameValues(values) {
  const names = new Set()
  for (const value of values) {
    const parsedNames = parseNameString.parse(value)?.names || value.split(/\s*;\s*/).filter((n) => n)
    for (const name of parsedNames) {
      names.add(name)
    }
  }
  return [...names]
}

/**
 * Split ";" or "," separated strings into a de-duplicated list (first occurrence wins).
 *
 * @param {string[]} values
 * @returns {string[]}
 */
function parseStringList(values) {
  const items = new Set()
  for (const value of values) {
    for (const item of value.split(/\s*[;,]\s*/)) {
      if (item) items.add(item)
    }
  }
  return [...items]
}

/**
 * Extract the first 4-digit run from a string.
 *
 * @param {string|null} str
 * @returns {string|null}
 */
function extractYear(str) {
  if (!str) return null
  const match = str.match(/\d{4}/)
  return match ? match[0] : null
}

/**
 * Extract an ISBN or ASIN candidate from a free-form identifier string.
 *
 * @param {string|null} identifier e.g. "ISBN 978-1-4028-9462-6"
 * @param {'isbn'|'asin'} identifierType anything other than 'isbn' is matched as an ASIN
 * @returns {string|null} the candidate with surrounding non-alphanumerics removed
 */
function extractIdentifierValue(identifier, identifierType) {
  if (!identifier) return null

  const pattern = identifierType === 'isbn' ? ISBN_PATTERN : ASIN_PATTERN
  const match = identifier.trim().match(pattern)
  if (!match) return null
  return (match[1] || match[0]).replace(/^[^a-z0-9]+|[^a-z0-9]+$/gi, '').trim()
}

/**
 * Find an ISBN or ASIN in the collected meta tags. Tried in order:
 *  1. the typed tag ("dc:identifier:isbn" / "dc:identifier:asin") or a tag named after the type
 *  2. generic identifier values that mention the type ("isbn" / "asin") in their text
 *  3. any generic identifier value
 * Generic values are dc:identifier and ncc:identifier, plus dc:source for ISBNs.
 *
 * @param {Object<string, string[]>} metaTags
 * @param {'isbn'|'asin'} identifierType
 * @returns {string|null}
 */
function parseIdentifier(metaTags, identifierType) {
  const typeTag = identifierType === 'isbn' ? 'dc:identifier:isbn' : 'dc:identifier:asin'
  const typedIdentifier = getFirstValue(metaTags, [typeTag, identifierType])
  if (typedIdentifier) {
    const extracted = extractIdentifierValue(typedIdentifier, identifierType)
    if (extracted) return extracted
  }

  const identifierValues = [
    ...getValues(metaTags, 'dc:identifier'),
    ...getValues(metaTags, 'ncc:identifier'),
    ...(identifierType === 'isbn' ? getValues(metaTags, 'dc:source') : [])
  ]

  // Values that name the identifier type are preferred over unlabelled ones
  const labelPattern = identifierType === 'isbn' ? /isbn/i : identifierType === 'asin' ? /asin/i : null
  const labelledValues = labelPattern ? identifierValues.filter((value) => labelPattern.test(value)) : []

  for (const identifier of [...labelledValues, ...identifierValues]) {
    const extracted = extractIdentifierValue(identifier, identifierType)
    if (extracted) return extracted
  }
  return null
}

/**
 * Create the parser callbacks that collect meta tags, the <title> text and
 * heading (h1-h6) texts from an ncc.html document, plus the state they fill.
 *
 * @returns {{state: {metaTags: Object<string, string[]>, titleText: string, chapterTitles: string[]}, handlers: {onopentag: Function, ontext: Function, onclosetag: Function}}}
 */
function createNccCollector() {
  const state = {
    // Null-prototype map: meta names come from the file, and names such as
    // "constructor" or "__proto__" would otherwise resolve to inherited members
    // of a plain object and make push() throw.
    metaTags: Object.create(null),
    titleText: '',
    chapterTitles: []
  }
  let inTitle = false
  let currentHeadingName = null
  let currentHeadingText = ''

  const handlers = {
    onopentag(name, attribs) {
      if (name === 'title') {
        inTitle = true
      }
      if (/^h[1-6]$/.test(name)) {
        currentHeadingName = name
        currentHeadingText = ''
      }
      if (name !== 'meta') return

      const tagName = attribs.name?.trim().toLowerCase()
      const content = attribs.content?.trim()
      if (!tagName || !content) return

      if (!state.metaTags[tagName]) state.metaTags[tagName] = []
      state.metaTags[tagName].push(content)
    },
    ontext(text) {
      if (inTitle) state.titleText += text
      if (currentHeadingName) currentHeadingText += text
    },
    onclosetag(name) {
      if (name === 'title') {
        inTitle = false
      }
      if (name === currentHeadingName) {
        // Collapse whitespace so headings split over several lines become one title
        const chapterTitle = currentHeadingText.replace(/\s+/g, ' ').trim()
        if (chapterTitle) {
          state.chapterTitles.push(chapterTitle)
        }
        currentHeadingName = null
        currentHeadingText = ''
      }
    }
  }

  return { state, handlers }
}

/**
 * Build the metadata record from collected ncc.html data.
 * Keys whose value is null are omitted; list keys are always present (possibly empty).
 *
 * @param {{metaTags: Object<string, string[]>, titleText: string, chapterTitles: string[]}} collected
 * @returns {Object}
 */
function buildDaisyMetadata({ metaTags, titleText, chapterTitles }) {
  const metadata = {
    title: getFirstValue(metaTags, ['dc:title']) || titleText.trim() || null,
    authors: parseNameValues(getValues(metaTags, 'dc:creator')),
    narrators: parseNameValues(getValues(metaTags, 'ncc:narrator')),
    publishedYear: extractYear(getFirstValue(metaTags, ['dc:date', 'ncc:revisiondate'])),
    publisher: getFirstValue(metaTags, ['dc:publisher']),
    description: getFirstValue(metaTags, ['dc:description']),
    language: getFirstValue(metaTags, ['dc:language']),
    genres: parseStringList([...getValues(metaTags, 'dc:subject'), ...getValues(metaTags, 'ncc:subject')]),
    tags: parseStringList([...getValues(metaTags, 'ncc:keywords'), ...getValues(metaTags, 'dc:tag')]),
    isbn: parseIdentifier(metaTags, 'isbn'),
    asin: parseIdentifier(metaTags, 'asin'),
    chapters: chapterTitles.map((title) => ({ title }))
  }

  return Object.fromEntries(Object.entries(metadata).filter(([, value]) => value !== null))
}

/**
 * Parse book metadata and chapter titles from the text of a DAISY ncc.html file.
 *
 * @param {string} htmlText
 * @returns {Object|null} null when htmlText is empty
 */
function parseDaisyMetadata(htmlText) {
  if (!htmlText) return null

  const { state, handlers } = createNccCollector()
  const parser = new h.Parser(handlers, { decodeEntities: true })

  parser.write(htmlText)
  parser.end()

  return buildDaisyMetadata(state)
}
isScannableNonMediaFile(item.extension))) { + if (isMediaFile(mediaType, item.extension, audiobooksOnly) || (includeNonMediaFiles && isScannableNonMediaFile(item.extension, item.name))) { mediaFileItems.push(item) } else { otherFileItems.push(item) diff --git a/test/server/utils/parsers/parseDaisyMetadata.test.js b/test/server/utils/parsers/parseDaisyMetadata.test.js new file mode 100644 index 000000000..8a18ce5e6 --- /dev/null +++ b/test/server/utils/parsers/parseDaisyMetadata.test.js @@ -0,0 +1,86 @@ +const chai = require('chai') +const expect = chai.expect +const { parseDaisyMetadata } = require('../../../../server/utils/parsers/parseDaisyMetadata') + +describe('parseDaisyMetadata', () => { + it('returns null if htmlText is empty', () => { + const result = parseDaisyMetadata('') + expect(result).to.be.null + }) + + it('parses common metadata values from DAISY ncc.html', () => { + const nccHtml = ` + + + Fallback Title + + + + + + + + + + + + + ` + + const result = parseDaisyMetadata(nccHtml) + expect(result.title).to.equal('The DAISY Book') + expect(result.authors).to.deep.equal(['Jane Doe', 'Richard Roe']) + expect(result.narrators).to.deep.equal(['Reader One', 'Reader Two']) + expect(result.publisher).to.equal('Talking Books Inc') + expect(result.publishedYear).to.equal('2021') + expect(result.language).to.equal('en') + expect(result.genres).to.deep.equal(['Fiction', 'Mystery']) + expect(result.tags).to.deep.equal(['audio', 'daisy']) + expect(result.isbn).to.equal('978-1-4028-9462-6') + expect(result.asin).to.equal('B012345678') + }) + + it('falls back to title tag when dc:title is not set', () => { + const nccHtml = ` + + + Title From Head + + + ` + const result = parseDaisyMetadata(nccHtml) + expect(result.title).to.equal('Title From Head') + }) + + it('parses isbn from dc:source in DAISY ncc.html', () => { + const nccHtml = ` + + + + + + ` + + const result = parseDaisyMetadata(nccHtml) + expect(result.isbn).to.equal('978-0-553-38016-3') + }) + + it('parses 
chapter names from heading entries in ncc.html', () => { + const nccHtml = ` + + +

Chapter 1

+

Chapter 2: The Road

+

Part 1

+ + + ` + + const result = parseDaisyMetadata(nccHtml) + expect(result.chapters).to.deep.equal([ + { title: 'Chapter 1' }, + { title: 'Chapter 2: The Road' }, + { title: 'Part 1' } + ]) + }) +}) diff --git a/test/server/utils/scandir.test.js b/test/server/utils/scandir.test.js index a5ff6ae0e..d00c361c4 100644 --- a/test/server/utils/scandir.test.js +++ b/test/server/utils/scandir.test.js @@ -49,4 +49,22 @@ describe('scanUtils', async () => { 'Author/Series2/Book5/deeply/nested': ['cd 01/audiofile.mp3', 'cd 02/audiofile.mp3'] }) }) + + it('should include DAISY ncc.html changes when includeNonMediaFiles is enabled', async () => { + const filePath = 'Author/Book3/ncc.html' + const dirname = Path.dirname(filePath) + const fileItems = [ + { + name: Path.basename(filePath), + reldirpath: dirname === '.' ? '' : dirname, + extension: Path.extname(filePath), + deep: filePath.split('/').length - 1 + } + ] + + const libraryItemGrouping = scanUtils.groupFileItemsIntoLibraryItemDirs('book', fileItems, false, true) + expect(libraryItemGrouping).to.deep.equal({ + 'Author/Book3': ['ncc.html'] + }) + }) })