From 6c9bf8c2bd42cb8ad06edebe0cb66384a58d8aac Mon Sep 17 00:00:00 2001 From: Toni Barth Date: Sat, 7 Feb 2026 16:45:40 +0100 Subject: [PATCH 1/5] first iteration of parsing metadata and chapter names from ncc.html file --- .../components/modals/libraries/EditModal.vue | 2 +- .../libraries/LibraryScannerSettings.vue | 7 +- server/models/Library.js | 2 +- server/scanner/BookScanner.js | 9 + server/scanner/DaisyFileScanner.js | 99 ++++++++++ server/scanner/LibraryItemScanData.js | 7 +- server/utils/parsers/parseDaisyMetadata.js | 176 ++++++++++++++++++ server/utils/scandir.js | 7 +- .../utils/parsers/parseDaisyMetadata.test.js | 73 ++++++++ test/server/utils/scandir.test.js | 18 ++ 10 files changed, 394 insertions(+), 6 deletions(-) create mode 100644 server/scanner/DaisyFileScanner.js create mode 100644 server/utils/parsers/parseDaisyMetadata.js create mode 100644 test/server/utils/parsers/parseDaisyMetadata.test.js diff --git a/client/components/modals/libraries/EditModal.vue b/client/components/modals/libraries/EditModal.vue index a6d8a4d59..3d462164c 100644 --- a/client/components/modals/libraries/EditModal.vue +++ b/client/components/modals/libraries/EditModal.vue @@ -127,7 +127,7 @@ export default { autoScanCronExpression: null, hideSingleBookSeries: false, onlyShowLaterBooksInContinueSeries: false, - metadataPrecedence: ['folderStructure', 'audioMetatags', 'nfoFile', 'txtFiles', 'opfFile', 'absMetadata'], + metadataPrecedence: ['folderStructure', 'audioMetatags', 'nfoFile', 'txtFiles', 'opfFile', 'daisyFile', 'absMetadata'], markAsFinishedPercentComplete: null, markAsFinishedTimeRemaining: 10 } diff --git a/client/components/modals/libraries/LibraryScannerSettings.vue b/client/components/modals/libraries/LibraryScannerSettings.vue index b27925ce4..07f1d15ec 100644 --- a/client/components/modals/libraries/LibraryScannerSettings.vue +++ b/client/components/modals/libraries/LibraryScannerSettings.vue @@ -81,6 +81,11 @@ export default { name: 'OPF file', include: true }, + daisyFile: { + id: 'daisyFile', + name: 'DAISY ncc.html file', + include: true + }, absMetadata: { id: 'absMetadata', name: 'Audiobookshelf metadata file', @@ -157,4 +162,4 @@ export default { this.init() } } - \ No newline at end of file + diff --git a/server/models/Library.js b/server/models/Library.js index 708880aad..72473b466 100644 --- a/server/models/Library.js +++ b/server/models/Library.js @@ -82,7 +82,7 @@ class Library extends Model { } static get defaultMetadataPrecedence() { - return ['folderStructure', 'audioMetatags', 'nfoFile', 'txtFiles', 'opfFile', 'absMetadata'] + return ['folderStructure', 'audioMetatags', 'nfoFile', 'txtFiles', 'opfFile', 'daisyFile', 'absMetadata'] } /** diff --git a/server/scanner/BookScanner.js b/server/scanner/BookScanner.js index a1e7ff507..a10c9565e 100644 --- a/server/scanner/BookScanner.js +++ b/server/scanner/BookScanner.js @@ -23,6 +23,7 @@ const CoverManager = require('../managers/CoverManager') const LibraryScan = require('./LibraryScan') const OpfFileScanner = require('./OpfFileScanner') const NfoFileScanner = require('./NfoFileScanner') +const DaisyFileScanner = require('./DaisyFileScanner') const AbsMetadataFileScanner = require('./AbsMetadataFileScanner') /** @@ -792,6 +793,14 @@ class BookScanner { await OpfFileScanner.scanBookOpfFile(this.libraryItemData.metadataOpfLibraryFile, this.bookMetadata) } + /** + * Metadata from DAISY ncc.html file + */ + async daisyFile() { + if (!this.libraryItemData.metadataDaisyNccLibraryFile) return + await DaisyFileScanner.scanBookDaisyFile(this.libraryItemData.metadataDaisyNccLibraryFile, this.bookMetadata, this.audioFiles) + } + /** * Metadata from metadata.json */ diff --git a/server/scanner/DaisyFileScanner.js b/server/scanner/DaisyFileScanner.js new file mode 100644 index 000000000..217709063 --- /dev/null +++ b/server/scanner/DaisyFileScanner.js @@ -0,0 +1,99 @@ +const { parseDaisyMetadata } = require('../utils/parsers/parseDaisyMetadata') +const { readTextFile } = require('../utils/fileUtils') +const Path = require('path') + +class DaisyFileScanner { + constructor() {} + + /** + * Parse metadata from DAISY ncc.html file found in library scan and update bookMetadata + * + * @param {import('../models/LibraryItem').LibraryFileObject} daisyLibraryFileObj + * @param {Object} bookMetadata + */ + async scanBookDaisyFile(daisyLibraryFileObj, bookMetadata, audioFiles = []) { + const htmlText = await readTextFile(daisyLibraryFileObj.metadata.path) + const daisyMetadata = htmlText ? parseDaisyMetadata(htmlText) : null + if (daisyMetadata) { + for (const key in daisyMetadata) { + if (key === 'tags') { + if (daisyMetadata.tags.length) { + bookMetadata.tags = daisyMetadata.tags + } + } else if (key === 'genres') { + if (daisyMetadata.genres.length) { + bookMetadata.genres = daisyMetadata.genres + } + } else if (key === 'authors') { + if (daisyMetadata.authors?.length) { + bookMetadata.authors = daisyMetadata.authors + } + } else if (key === 'narrators') { + if (daisyMetadata.narrators?.length) { + bookMetadata.narrators = daisyMetadata.narrators + } + } else if (key === 'chapters') { + if (!daisyMetadata.chapters?.length) continue + + // DAISY ncc.html provides chapter names; preserve existing timings if available. + if (bookMetadata.chapters?.length) { + const updatedChapters = bookMetadata.chapters.map((chapter, index) => { + const daisyChapter = daisyMetadata.chapters[index] + if (!daisyChapter?.title) return chapter + return { + ...chapter, + id: chapter.id ?? index, + title: daisyChapter.title + } + }) + bookMetadata.chapters = updatedChapters + } else { + const chaptersFromFiles = this.buildChaptersFromAudioFiles(audioFiles, daisyMetadata.chapters) + if (chaptersFromFiles.length) { + bookMetadata.chapters = chaptersFromFiles + } + } + } else if (daisyMetadata[key]) { + bookMetadata[key] = daisyMetadata[key] + } + } + } + } + + /** + * Build chapter timings from ordered audio files while applying DAISY chapter titles. + * Falls back to file basenames if DAISY has fewer titles than files. + * + * @param {import('../models/Book').AudioFileObject[]} audioFiles + * @param {{title:string}[]} daisyChapters + * @returns {import('../models/Book').ChapterObject[]} + */ + buildChaptersFromAudioFiles(audioFiles, daisyChapters) { + if (!audioFiles?.length) return [] + + const chapters = [] + let currentStartTime = 0 + let chapterId = 0 + + audioFiles.forEach((audioFile) => { + if (!audioFile.duration) return + + const fallbackTitle = audioFile.metadata?.filename + ? Path.basename(audioFile.metadata.filename, Path.extname(audioFile.metadata.filename)) + : `Chapter ${chapterId + 1}` + const title = daisyChapters[chapterId]?.title || fallbackTitle + + chapters.push({ + id: chapterId++, + start: currentStartTime, + end: currentStartTime + audioFile.duration, + title + }) + + currentStartTime += audioFile.duration + }) + + return chapters + } +} +module.exports = new DaisyFileScanner() diff --git a/server/scanner/LibraryItemScanData.js b/server/scanner/LibraryItemScanData.js index d5a4a7a29..985b7342c 100644 --- a/server/scanner/LibraryItemScanData.js +++ b/server/scanner/LibraryItemScanData.js @@ -173,6 +173,11 @@ class LibraryItemScanData { return this.libraryFiles.find(lf => lf.metadata.ext.toLowerCase() === '.nfo') } + /** @type {LibraryItem.LibraryFileObject} */ + get metadataDaisyNccLibraryFile() { + return this.libraryFiles.find(lf => lf.metadata.filename?.toLowerCase() === 'ncc.html') + } + /** * * @param {LibraryItem} existingLibraryItem @@ -374,4 +379,4 @@ class LibraryItemScanData { } } } -module.exports = LibraryItemScanData \ No newline at end of file +module.exports = LibraryItemScanData diff --git a/server/utils/parsers/parseDaisyMetadata.js b/server/utils/parsers/parseDaisyMetadata.js new file mode 100644 index 000000000..decb06186 --- /dev/null +++ b/server/utils/parsers/parseDaisyMetadata.js @@ -0,0 +1,176 @@ +const h = require('htmlparser2') +const parseNameString = require('./parseNameString') + +function getValues(metaTags, tagName) { + return metaTags[tagName]?.filter((v) => v) || [] +} + +function getFirstValue(metaTags, tagNames) { + for (const tagName of tagNames) { + const values = getValues(metaTags, tagName) + if (values.length) return values[0] + } + return null +} + +function parseNameValues(values) { + const names = [] + values.forEach((value) => { + const parsedNames = parseNameString.parse(value)?.names || value.split(/\s*;\s*/).filter((n) => n) + parsedNames.forEach((name) => { + if (!names.includes(name)) names.push(name) + }) + }) + return names +} + +function parseStringList(values) { + const items = [] + values.forEach((value) => { + value.split(/\s*[;,]\s*/).forEach((item) => { + if (item && !items.includes(item)) { + items.push(item) + } + }) + }) + return items +} + +function extractYear(str) { + if (!str) return null + const match = str.match(/\d{4}/) + return match ? match[0] : null +} + +function extractIdentifierValue(identifier, identifierType) { + if (!identifier) return null + + const value = identifier.trim() + const expression = identifierType === 'isbn' + ? /(?:^|[^a-z0-9])(97[89][\d\- ]{9,16}[\dx]|[\d\- ]{9,14}[\dx])(?:$|[^a-z0-9])/i + : /(?:^|[^a-z0-9])([a-z0-9]{10})(?:$|[^a-z0-9])/i + + const match = value.match(expression) + if (!match) return null + return (match[1] || match[0]).replace(/^[^a-z0-9]+|[^a-z0-9]+$/gi, '').trim() +} + +function parseIdentifier(metaTags, identifierType) { + const typeTag = identifierType === 'isbn' ? 'dc:identifier:isbn' : 'dc:identifier:asin' + const typedIdentifier = getFirstValue(metaTags, [typeTag, identifierType]) + if (typedIdentifier) { + const extracted = extractIdentifierValue(typedIdentifier, identifierType) + if (extracted) return extracted + } + + const identifierValues = [ + ...getValues(metaTags, 'dc:identifier'), + ...getValues(metaTags, 'ncc:identifier') + ] + for (const identifier of identifierValues) { + if (identifierType === 'isbn' && /isbn/i.test(identifier)) { + const extracted = extractIdentifierValue(identifier, identifierType) + if (extracted) return extracted + } + if (identifierType === 'asin' && /asin/i.test(identifier)) { + const extracted = extractIdentifierValue(identifier, identifierType) + if (extracted) return extracted + } + } + + for (const identifier of identifierValues) { + const extracted = extractIdentifierValue(identifier, identifierType) + if (extracted) return extracted + } + return null +} + +function parseDaisyMetadata(htmlText) { + if (!htmlText) return null + + const metaTags = {} + let titleText = '' + let inTitle = false + let currentHeadingName = null + let currentHeadingText = '' + const chapterTitles = [] + + const parser = new h.Parser( + { + onopentag: (name, attribs) => { + if (name === 'title') { + inTitle = true + } + if (/^h[1-6]$/.test(name)) { + currentHeadingName = name + currentHeadingText = '' + } + if (name !== 'meta') return + + const tagName = attribs.name?.trim().toLowerCase() + const content = attribs.content?.trim() + if (!tagName || !content) return + + if (!metaTags[tagName]) metaTags[tagName] = [] + metaTags[tagName].push(content) + }, + ontext: (text) => { + if (inTitle) titleText += text + if (currentHeadingName) currentHeadingText += text + }, + onclosetag: (name) => { + if (name === 'title') { + inTitle = false + } + if (name === currentHeadingName) { + const chapterTitle = currentHeadingText.replace(/\s+/g, ' ').trim() + if (chapterTitle) { + chapterTitles.push(chapterTitle) + } + currentHeadingName = null + currentHeadingText = '' + } + } + }, + { decodeEntities: true } + ) + + parser.write(htmlText) + parser.end() + + const creators = parseNameValues(getValues(metaTags, 'dc:creator')) + const narrators = parseNameValues(getValues(metaTags, 'ncc:narrator')) + const subjects = parseStringList([ + ...getValues(metaTags, 'dc:subject'), + ...getValues(metaTags, 'ncc:subject') + ]) + const tags = parseStringList([ + ...getValues(metaTags, 'ncc:keywords'), + ...getValues(metaTags, 'dc:tag') + ]) + + const metadata = { + title: getFirstValue(metaTags, ['dc:title']) || titleText.trim() || null, + authors: creators, + narrators, + publishedYear: extractYear(getFirstValue(metaTags, ['dc:date', 'ncc:revisiondate'])), + publisher: getFirstValue(metaTags, ['dc:publisher']), + description: getFirstValue(metaTags, ['dc:description']), + language: getFirstValue(metaTags, ['dc:language']), + genres: subjects, + tags, + isbn: parseIdentifier(metaTags, 'isbn'), + asin: parseIdentifier(metaTags, 'asin'), + chapters: chapterTitles.map((title) => ({ title })) + } + + for (const key in metadata) { + if (metadata[key] === null) { + delete metadata[key] + } + } + + return metadata +} + +module.exports = { parseDaisyMetadata } diff --git a/server/utils/scandir.js b/server/utils/scandir.js index 6dd2d67fe..b8ddb3b04 100644 --- a/server/utils/scandir.js +++ b/server/utils/scandir.js @@ -24,7 +24,10 @@ function isMediaFile(mediaType, ext, audiobooksOnly = false) { return globals.SupportedAudioTypes.includes(extclean) || globals.SupportedEbookTypes.includes(extclean) } -function isScannableNonMediaFile(ext) { +function isScannableNonMediaFile(ext, filename = '') { + const filenameLower = filename.toLowerCase() + if (filenameLower === 'ncc.html') return true + if (!ext) return false const extclean = ext.slice(1).toLowerCase() return globals.TextFileTypes.includes(extclean) || globals.MetadataFileTypes.includes(extclean) || globals.SupportedImageTypes.includes(extclean) @@ -58,7 +61,7 @@ function groupFileItemsIntoLibraryItemDirs(mediaType, fileItems, audiobooksOnly, /** @type {import('./fileUtils').FilePathItem[]} */ const otherFileItems = [] itemsFiltered.forEach((item) => { - if (isMediaFile(mediaType, item.extension, audiobooksOnly) || (includeNonMediaFiles && isScannableNonMediaFile(item.extension))) { + if (isMediaFile(mediaType, item.extension, audiobooksOnly) || (includeNonMediaFiles && isScannableNonMediaFile(item.extension, item.name))) { mediaFileItems.push(item) } else { otherFileItems.push(item) diff --git a/test/server/utils/parsers/parseDaisyMetadata.test.js b/test/server/utils/parsers/parseDaisyMetadata.test.js new file mode 100644 index 000000000..2367a9555 --- /dev/null +++ b/test/server/utils/parsers/parseDaisyMetadata.test.js @@ -0,0 +1,73 @@ +const chai = require('chai') +const expect = chai.expect +const { parseDaisyMetadata } = require('../../../../server/utils/parsers/parseDaisyMetadata') + +describe('parseDaisyMetadata', () => { + it('returns null if htmlText is empty', () => { + const result = parseDaisyMetadata('') + expect(result).to.be.null + }) + + it('parses common metadata values from DAISY ncc.html', () => { + const nccHtml = ` + + + Fallback Title + + + + + + + + + + + + + ` + + const result = parseDaisyMetadata(nccHtml) + expect(result.title).to.equal('The DAISY Book') + expect(result.authors).to.deep.equal(['Jane Doe', 'Richard Roe']) + expect(result.narrators).to.deep.equal(['Reader One', 'Reader Two']) + expect(result.publisher).to.equal('Talking Books Inc') + expect(result.publishedYear).to.equal('2021') + expect(result.language).to.equal('en') + expect(result.genres).to.deep.equal(['Fiction', 'Mystery']) + expect(result.tags).to.deep.equal(['audio', 'daisy']) + expect(result.isbn).to.equal('978-1-4028-9462-6') + expect(result.asin).to.equal('B012345678') + }) + + it('falls back to title tag when dc:title is not set', () => { + const nccHtml = ` + + + Title From Head + + + ` + const result = parseDaisyMetadata(nccHtml) + expect(result.title).to.equal('Title From Head') + }) + + it('parses chapter names from heading entries in ncc.html', () => { + const nccHtml = ` + + +

Chapter 1

+

Chapter 2: The Road

+

Part 1

+ + + ` + + const result = parseDaisyMetadata(nccHtml) + expect(result.chapters).to.deep.equal([ + { title: 'Chapter 1' }, + { title: 'Chapter 2: The Road' }, + { title: 'Part 1' } + ]) + }) +}) diff --git a/test/server/utils/scandir.test.js b/test/server/utils/scandir.test.js index a5ff6ae0e..d00c361c4 100644 --- a/test/server/utils/scandir.test.js +++ b/test/server/utils/scandir.test.js @@ -49,4 +49,22 @@ describe('scanUtils', async () => { 'Author/Series2/Book5/deeply/nested': ['cd 01/audiofile.mp3', 'cd 02/audiofile.mp3'] }) }) + + it('should include DAISY ncc.html changes when includeNonMediaFiles is enabled', async () => { + const filePath = 'Author/Book3/ncc.html' + const dirname = Path.dirname(filePath) + const fileItems = [ + { + name: Path.basename(filePath), + reldirpath: dirname === '.' ? '' : dirname, + extension: Path.extname(filePath), + deep: filePath.split('/').length - 1 + } + ] + + const libraryItemGrouping = scanUtils.groupFileItemsIntoLibraryItemDirs('book', fileItems, false, true) + expect(libraryItemGrouping).to.deep.equal({ + 'Author/Book3': ['ncc.html'] + }) + }) }) From 687e62e1fafd1df70f308e1629c7d2cccf5c0db9 Mon Sep 17 00:00:00 2001 From: Toni Barth Date: Sat, 7 Feb 2026 18:00:29 +0100 Subject: [PATCH 2/5] try to properly interpret ncc.html encoding (seems to be a bit weird / incorrect sometimes) --- package-lock.json | 57 +++++++++++++++++++++++++++++ package.json | 2 + server/scanner/DaisyFileScanner.js | 2 +- server/utils/fileUtils.js | 56 ++++++++++++++++++++++++++-- test/server/utils/fileUtils.test.js | 20 ++++++++++ 5 files changed, 132 insertions(+), 5 deletions(-) diff --git a/package-lock.json b/package-lock.json index 08707893d..5fe3c5c74 100644 --- a/package-lock.json +++ b/package-lock.json @@ -15,6 +15,7 @@ "express-rate-limit": "^7.5.1", "express-session": "^1.17.3", "graceful-fs": "^4.2.10", + "html-encoding-sniffer": "^4.0.0", "htmlparser2": "^8.0.1", "lru-cache": "^10.0.3", "node-unrar-js": "^2.0.2", @@ -28,6 +29,7 @@ "socket.io": "^4.5.4", "sqlite3": "^5.1.7", "ssrf-req-filter": "^1.1.0", + "whatwg-encoding": "^3.1.1", "xml2js": "^0.5.0" }, "bin": { @@ -122,6 +124,7 @@ "resolved": "https://registry.npmjs.org/@babel/core/-/core-7.23.3.tgz", "integrity": "sha512-Jg+msLuNuCJDyBvFv5+OKOUjWMZgd85bKjbICd3zWrKAo+bJ49HJufi7CQE0q0uR8NGyO6xkCACScNqyjHSZew==", "dev": true, + "peer": true, "dependencies": { "@ampproject/remapping": "^2.2.0", "@babel/code-frame": "^7.22.13", @@ -1049,6 +1052,7 @@ "url": "https://github.com/sponsors/ai" } ], + "peer": true, "dependencies": { "caniuse-lite": "^1.0.30001541", "electron-to-chromium": "^1.4.535", @@ -1857,6 +1861,7 @@ "version": "4.18.2", "resolved": "https://registry.npmjs.org/express/-/express-4.18.2.tgz", "integrity": "sha512-5/PsL6iGPdfQ/lKM1UuielYgv3BUoJfz1aUwU9vHZ+J7gyvwdQXFEBIEIaxeGf0GIcreATNyBExtalisDbuMqQ==", + "peer": true, "dependencies": { "accepts": "~1.3.8", "array-flatten": "1.1.1", @@ -2113,6 +2118,21 @@ "integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==", "devOptional": true }, + "node_modules/fsevents": { + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", + "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, "node_modules/function-bind": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", @@ -2284,6 +2304,18 @@ "he": "bin/he" } }, + "node_modules/html-encoding-sniffer": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/html-encoding-sniffer/-/html-encoding-sniffer-4.0.0.tgz", + "integrity": "sha512-Y22oTqIU4uuPgEemfz7NDJz6OeKf12Lsu+QC+s3BVpda64lTiMYCyGwg5ki4vFxkMwQdeZDl2adZoqUgdFuTgQ==", + "license": "MIT", + "dependencies": { + "whatwg-encoding": "^3.1.1" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/html-escaper": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz", @@ -5358,6 +5390,31 @@ "node": ">= 0.8" } }, + "node_modules/whatwg-encoding": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz", + "integrity": "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==", + "deprecated": "Use @exodus/bytes instead for a more spec-conformant and faster implementation", + "license": "MIT", + "dependencies": { + "iconv-lite": "0.6.3" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/whatwg-encoding/node_modules/iconv-lite": { + "version": "0.6.3", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz", + "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==", + "license": "MIT", + "dependencies": { + "safer-buffer": ">= 2.1.2 < 3.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/which": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz", diff --git a/package.json b/package.json index 3ee3fb391..b2de71f0d 100644 --- a/package.json +++ b/package.json @@ -44,6 +44,7 @@ "express-rate-limit": "^7.5.1", "express-session": "^1.17.3", "graceful-fs": "^4.2.10", + "html-encoding-sniffer": "^4.0.0", "htmlparser2": "^8.0.1", "lru-cache": "^10.0.3", "node-unrar-js": "^2.0.2", @@ -57,6 +58,7 @@ "socket.io": "^4.5.4", "sqlite3": "^5.1.7", "ssrf-req-filter": "^1.1.0", + "whatwg-encoding": "^3.1.1", "xml2js": "^0.5.0" }, "devDependencies": { diff --git a/server/scanner/DaisyFileScanner.js b/server/scanner/DaisyFileScanner.js index 217709063..e0b7cd84f 100644 --- a/server/scanner/DaisyFileScanner.js +++ b/server/scanner/DaisyFileScanner.js @@ -12,7 +12,7 @@ class DaisyFileScanner { * @param {Object} bookMetadata */ async scanBookDaisyFile(daisyLibraryFileObj, bookMetadata, audioFiles = []) { - const htmlText = await readTextFile(daisyLibraryFileObj.metadata.path) + const htmlText = await readTextFile(daisyLibraryFileObj.metadata.path, { detectEncoding: true, isHtml: true }) const daisyMetadata = htmlText ? parseDaisyMetadata(htmlText) : null if (daisyMetadata) { for (const key in daisyMetadata) { diff --git a/server/utils/fileUtils.js b/server/utils/fileUtils.js index 9a349bd54..0f79ad09e 100644 --- a/server/utils/fileUtils.js +++ b/server/utils/fileUtils.js @@ -6,6 +6,8 @@ const fs = require('../libs/fsExtra') const rra = require('../libs/recursiveReaddirAsync') const Logger = require('../Logger') const { AudioMimeType } = require('./constants') +const sniffHTMLEncoding = require('html-encoding-sniffer') +const whatwgEncoding = require('whatwg-encoding') /** * Make sure folder separator is POSIX for Windows file paths. e.g. "C:\Users\Abs" becomes "C:/Users/Abs" @@ -116,14 +118,60 @@ function getIno(path) { module.exports.getIno = getIno /** - * Read contents of file - * @param {string} path + * @typedef ReadTextFileOptions + * @property {boolean} [detectEncoding] detect text encoding before decoding + * @property {boolean} [isHtml] use HTML charset hints when detectEncoding is enabled + */ + +function detectTextEncoding(buffer, options = {}) { + const { isHtml = false } = options + if (!isHtml) { + return 'UTF-8' + } + + try { + const sniffedEncoding = sniffHTMLEncoding(buffer, { defaultEncoding: 'windows-1252' }) || 'windows-1252' + return whatwgEncoding.labelToName(sniffedEncoding) || 'UTF-8' + } catch { + return 'UTF-8' + } +} + +/** + * Decode raw text bytes with optional encoding detection. + * + * @param {Buffer} buffer + * @param {ReadTextFileOptions} [options] * @returns {string} */ -async function readTextFile(path) { +function decodeTextBuffer(buffer, options = {}) { + if (!buffer) return '' + const { detectEncoding = false, isHtml = false } = options + + if (!detectEncoding) { + return String(buffer) + } + + const fallbackEncoding = detectTextEncoding(buffer, { isHtml }) + try { + // WHATWG decode handles BOM override and legacy encoding tables. + return whatwgEncoding.decode(buffer, fallbackEncoding) + } catch { + return String(buffer) + } +} +module.exports.decodeTextBuffer = decodeTextBuffer + +/** + * Read contents of file + * @param {string} path + * @param {ReadTextFileOptions} [options] + * @returns {string} + */ +async function readTextFile(path, options = {}) { try { var data = await fs.readFile(path) - return String(data) + return decodeTextBuffer(data, options) } catch (error) { Logger.error(`[FileUtils] ReadTextFile error ${error}`) return '' diff --git a/test/server/utils/fileUtils.test.js b/test/server/utils/fileUtils.test.js index b57a6fb86..a0482f93b 100644 --- a/test/server/utils/fileUtils.test.js +++ b/test/server/utils/fileUtils.test.js @@ -6,6 +6,26 @@ const fs = require('fs') const Logger = require('../../../server/Logger') describe('fileUtils', () => { + describe('decodeTextBuffer', () => { + it('decodes html using charset declaration (windows-1252)', () => { + const htmlPrefix = Buffer.from('M') + const htmlSuffix = Buffer.from('ller') + const input = Buffer.concat([htmlPrefix, Buffer.from([0xfc]), htmlSuffix]) + + const decoded = fileUtils.decodeTextBuffer(input, { detectEncoding: true, isHtml: true }) + expect(decoded).to.include('Müller') + }) + + it('falls back to windows-1252 for html without charset when utf-8 decoding is invalid', () => { + const htmlPrefix = Buffer.from('Gr') + const htmlSuffix = Buffer.from('n') + const input = Buffer.concat([htmlPrefix, Buffer.from([0xfc]), htmlSuffix]) + + const decoded = fileUtils.decodeTextBuffer(input, { detectEncoding: true, isHtml: true }) + expect(decoded).to.include('Grün') + }) + }) + it('shouldIgnoreFile', () => { global.isWin = process.platform === 'win32' From b05acce22b292be506ec71f71546f5e7742cccda Mon Sep 17 00:00:00 2001 From: Toni Barth Date: Sat, 7 Feb 2026 19:32:30 +0100 Subject: [PATCH 3/5] try to replace html sniffing with chardet to fix ncc.html files with set encoding but strings that ignore that --- package-lock.json | 20 +++++++------------- package.json | 2 +- server/utils/fileUtils.js | 26 +++++++++++--------------- 3 files changed, 19 insertions(+), 29 deletions(-) diff --git a/package-lock.json b/package-lock.json index 5fe3c5c74..5deb7ce2d 100644 --- a/package-lock.json +++ b/package-lock.json @@ -10,12 +10,12 @@ "license": "GPL-3.0", "dependencies": { "axios": "^0.27.2", + "chardet": "^2.1.1", "cookie-parser": "^1.4.6", "express": "^4.17.1", "express-rate-limit": "^7.5.1", "express-session": "^1.17.3", "graceful-fs": "^4.2.10", - "html-encoding-sniffer": "^4.0.0", "htmlparser2": "^8.0.1", "lru-cache": "^10.0.3", "node-unrar-js": "^2.0.2", @@ -1255,6 +1255,12 @@ "node": ">=8" } }, + "node_modules/chardet": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/chardet/-/chardet-2.1.1.tgz", + "integrity": "sha512-PsezH1rqdV9VvyNhxxOW32/d75r01NY7TQCmOqomRo15ZSOKbpTFVsfjghxo6JloQUCGnH4k1LGu0R4yCLlWQQ==", + "license": "MIT" + }, "node_modules/check-error": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/check-error/-/check-error-1.0.3.tgz", @@ -2304,18 +2310,6 @@ "he": "bin/he" } }, - "node_modules/html-encoding-sniffer": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/html-encoding-sniffer/-/html-encoding-sniffer-4.0.0.tgz", - "integrity": "sha512-Y22oTqIU4uuPgEemfz7NDJz6OeKf12Lsu+QC+s3BVpda64lTiMYCyGwg5ki4vFxkMwQdeZDl2adZoqUgdFuTgQ==", - "license": "MIT", - "dependencies": { - "whatwg-encoding": "^3.1.1" - }, - "engines": { - "node": ">=18" - } - }, "node_modules/html-escaper": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz", diff --git a/package.json b/package.json index b2de71f0d..44be24be1 100644 --- a/package.json +++ b/package.json @@ -39,12 +39,12 @@ "license": "GPL-3.0", "dependencies": { "axios": "^0.27.2", + "chardet": "^2.1.1", "cookie-parser": "^1.4.6", "express": "^4.17.1", "express-rate-limit": "^7.5.1", "express-session": "^1.17.3", "graceful-fs": "^4.2.10", - "html-encoding-sniffer": "^4.0.0", "htmlparser2": "^8.0.1", "lru-cache": "^10.0.3", "node-unrar-js": "^2.0.2", diff --git a/server/utils/fileUtils.js b/server/utils/fileUtils.js index 0f79ad09e..c55261cd9 100644 --- a/server/utils/fileUtils.js +++ b/server/utils/fileUtils.js @@ -6,7 +6,7 @@ const fs = require('../libs/fsExtra') const rra = require('../libs/recursiveReaddirAsync') const Logger = require('../Logger') const { AudioMimeType } = require('./constants') -const sniffHTMLEncoding = require('html-encoding-sniffer') +const chardet = require('chardet') const whatwgEncoding = require('whatwg-encoding') /** @@ -119,22 +119,18 @@ module.exports.getIno = getIno /** * @typedef ReadTextFileOptions - * @property {boolean} [detectEncoding] detect text encoding before decoding - * @property {boolean} [isHtml] use HTML charset hints when detectEncoding is enabled */ -function detectTextEncoding(buffer, options = {}) { - const { isHtml = false } = options - if (!isHtml) { - return 'UTF-8' - } - +function detectTextEncoding(buffer) { try { - const sniffedEncoding = sniffHTMLEncoding(buffer, { defaultEncoding: 'windows-1252' }) || 'windows-1252' - return whatwgEncoding.labelToName(sniffedEncoding) || 'UTF-8' - } catch { - return 'UTF-8' - } + const detectedEncoding = chardet.detect(buffer) + const labeledEncoding = detectedEncoding ? whatwgEncoding.labelToName(detectedEncoding) : null + if (labeledEncoding) { + return labeledEncoding + } + } catch {} + + return 'UTF-8' } /** @@ -152,7 +148,7 @@ function decodeTextBuffer(buffer, options = {}) { return String(buffer) } - const fallbackEncoding = detectTextEncoding(buffer, { isHtml }) + const fallbackEncoding = detectTextEncoding(buffer) try { // WHATWG decode handles BOM override and legacy encoding tables. return whatwgEncoding.decode(buffer, fallbackEncoding) From 52a0b61b976775e80f18d8eb3dff3621a9a915ea Mon Sep 17 00:00:00 2001 From: Toni Barth Date: Sun, 8 Feb 2026 03:33:56 +0100 Subject: [PATCH 4/5] Revert "try to replace html sniffing with chardet to fix ncc.html files with set encoding but strings that ignore that" This reverts commit 3a1be51a830a7a725a32c0ae04de3090786e8722. Revert "try to properly interpret ncc.html encoding (seems to be a bit weird / incorrect sometimes)" This reverts commit fac441559584cfc2f65baadf3475138e7a6017c8. --- package-lock.json | 51 ----------------------------- package.json | 2 -- server/scanner/DaisyFileScanner.js | 2 +- server/utils/fileUtils.js | 48 ++------------------------- test/server/utils/fileUtils.test.js | 20 ----------- 5 files changed, 3 insertions(+), 120 deletions(-) diff --git a/package-lock.json b/package-lock.json index 5deb7ce2d..08707893d 100644 --- a/package-lock.json +++ b/package-lock.json @@ -10,7 +10,6 @@ "license": "GPL-3.0", "dependencies": { "axios": "^0.27.2", - "chardet": "^2.1.1", "cookie-parser": "^1.4.6", "express": "^4.17.1", "express-rate-limit": "^7.5.1", @@ -29,7 +28,6 @@ "socket.io": "^4.5.4", "sqlite3": "^5.1.7", "ssrf-req-filter": "^1.1.0", - "whatwg-encoding": "^3.1.1", "xml2js": "^0.5.0" }, "bin": { @@ -124,7 +122,6 @@ "resolved": "https://registry.npmjs.org/@babel/core/-/core-7.23.3.tgz", "integrity": "sha512-Jg+msLuNuCJDyBvFv5+OKOUjWMZgd85bKjbICd3zWrKAo+bJ49HJufi7CQE0q0uR8NGyO6xkCACScNqyjHSZew==", "dev": true, - "peer": true, "dependencies": { "@ampproject/remapping": "^2.2.0", "@babel/code-frame": "^7.22.13", @@ -1052,7 +1049,6 @@ "url": "https://github.com/sponsors/ai" } ], - "peer": true, "dependencies": { "caniuse-lite": "^1.0.30001541", "electron-to-chromium": "^1.4.535", @@ -1255,12 +1251,6 @@ "node": ">=8" } }, - "node_modules/chardet": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/chardet/-/chardet-2.1.1.tgz", - "integrity": "sha512-PsezH1rqdV9VvyNhxxOW32/d75r01NY7TQCmOqomRo15ZSOKbpTFVsfjghxo6JloQUCGnH4k1LGu0R4yCLlWQQ==", - "license": "MIT" - }, "node_modules/check-error": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/check-error/-/check-error-1.0.3.tgz", @@ -1867,7 +1857,6 @@ "version": "4.18.2", "resolved": "https://registry.npmjs.org/express/-/express-4.18.2.tgz", "integrity": "sha512-5/PsL6iGPdfQ/lKM1UuielYgv3BUoJfz1aUwU9vHZ+J7gyvwdQXFEBIEIaxeGf0GIcreATNyBExtalisDbuMqQ==", - "peer": true, "dependencies": { "accepts": "~1.3.8", "array-flatten": "1.1.1", @@ -2124,21 +2113,6 @@ "integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==", "devOptional": true }, - "node_modules/fsevents": { - "version": "2.3.3", - "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", - "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", - "dev": true, - "hasInstallScript": true, - "license": "MIT", - "optional": true, - "os": [ - "darwin" - ], - "engines": { - "node": "^8.16.0 || ^10.6.0 || >=11.0.0" - } - }, "node_modules/function-bind": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", @@ -5384,31 +5358,6 @@ "node": ">= 0.8" } }, - "node_modules/whatwg-encoding": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz", - "integrity": "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==", - "deprecated": "Use @exodus/bytes instead for a more spec-conformant and faster implementation", - "license": "MIT", - "dependencies": { - "iconv-lite": "0.6.3" - }, - "engines": { - "node": ">=18" - } - }, - "node_modules/whatwg-encoding/node_modules/iconv-lite": { - "version": "0.6.3", - "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz", - "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==", - "license": "MIT", - "dependencies": { - "safer-buffer": ">= 2.1.2 < 3.0.0" - }, - "engines": { - "node": ">=0.10.0" - } - }, "node_modules/which": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz", diff --git a/package.json b/package.json index 44be24be1..3ee3fb391 100644 --- a/package.json +++ b/package.json @@ -39,7 +39,6 @@ "license": "GPL-3.0", "dependencies": { "axios": "^0.27.2", - "chardet": "^2.1.1", "cookie-parser": "^1.4.6", "express": "^4.17.1", "express-rate-limit": "^7.5.1", @@ -58,7 +57,6 @@ "socket.io": "^4.5.4", "sqlite3": "^5.1.7", "ssrf-req-filter": "^1.1.0", - "whatwg-encoding": "^3.1.1", "xml2js": "^0.5.0" }, "devDependencies": { diff --git a/server/scanner/DaisyFileScanner.js b/server/scanner/DaisyFileScanner.js index e0b7cd84f..217709063 100644 --- a/server/scanner/DaisyFileScanner.js +++ b/server/scanner/DaisyFileScanner.js @@ -12,7 +12,7 @@ class DaisyFileScanner { * @param {Object} bookMetadata */ async scanBookDaisyFile(daisyLibraryFileObj, bookMetadata, audioFiles = []) { - const htmlText = await readTextFile(daisyLibraryFileObj.metadata.path, { detectEncoding: true, isHtml: true }) + const htmlText = await readTextFile(daisyLibraryFileObj.metadata.path) const daisyMetadata = htmlText ? parseDaisyMetadata(htmlText) : null if (daisyMetadata) { for (const key in daisyMetadata) { diff --git a/server/utils/fileUtils.js b/server/utils/fileUtils.js index c55261cd9..9a349bd54 100644 --- a/server/utils/fileUtils.js +++ b/server/utils/fileUtils.js @@ -6,8 +6,6 @@ const fs = require('../libs/fsExtra') const rra = require('../libs/recursiveReaddirAsync') const Logger = require('../Logger') const { AudioMimeType } = require('./constants') -const chardet = require('chardet') -const whatwgEncoding = require('whatwg-encoding') /** * Make sure folder separator is POSIX for Windows file paths. e.g. "C:\Users\Abs" becomes "C:/Users/Abs" @@ -117,57 +115,15 @@ function getIno(path) { } module.exports.getIno = getIno -/** - * @typedef ReadTextFileOptions - */ - -function detectTextEncoding(buffer) { - try { - const detectedEncoding = chardet.detect(buffer) - const labeledEncoding = detectedEncoding ? whatwgEncoding.labelToName(detectedEncoding) : null - if (labeledEncoding) { - return labeledEncoding - } - } catch {} - - return 'UTF-8' -} - -/** - * Decode raw text bytes with optional encoding detection. - * - * @param {Buffer} buffer - * @param {ReadTextFileOptions} [options] - * @returns {string} - */ -function decodeTextBuffer(buffer, options = {}) { - if (!buffer) return '' - const { detectEncoding = false, isHtml = false } = options - - if (!detectEncoding) { - return String(buffer) - } - - const fallbackEncoding = detectTextEncoding(buffer) - try { - // WHATWG decode handles BOM override and legacy encoding tables. - return whatwgEncoding.decode(buffer, fallbackEncoding) - } catch { - return String(buffer) - } -} -module.exports.decodeTextBuffer = decodeTextBuffer - /** * Read contents of file * @param {string} path - * @param {ReadTextFileOptions} [options] * @returns {string} */ -async function readTextFile(path, options = {}) { +async function readTextFile(path) { try { var data = await fs.readFile(path) - return decodeTextBuffer(data, options) + return String(data) } catch (error) { Logger.error(`[FileUtils] ReadTextFile error ${error}`) return '' diff --git a/test/server/utils/fileUtils.test.js b/test/server/utils/fileUtils.test.js index a0482f93b..b57a6fb86 100644 --- a/test/server/utils/fileUtils.test.js +++ b/test/server/utils/fileUtils.test.js @@ -6,26 +6,6 @@ const fs = require('fs') const Logger = require('../../../server/Logger') describe('fileUtils', () => { - describe('decodeTextBuffer', () => { - it('decodes html using charset declaration (windows-1252)', () => { - const htmlPrefix = Buffer.from('M') - const htmlSuffix = Buffer.from('ller') - const input = Buffer.concat([htmlPrefix, Buffer.from([0xfc]), htmlSuffix]) - - const decoded = fileUtils.decodeTextBuffer(input, { detectEncoding: true, isHtml: true }) - expect(decoded).to.include('Müller') - }) - - it('falls back to windows-1252 for html without charset when utf-8 decoding is invalid', () => { - const htmlPrefix = Buffer.from('Gr') - const htmlSuffix = Buffer.from('n') - const input = Buffer.concat([htmlPrefix, Buffer.from([0xfc]), htmlSuffix]) - - const decoded = fileUtils.decodeTextBuffer(input, { detectEncoding: true, isHtml: true }) - expect(decoded).to.include('Grün') - }) - }) - it('shouldIgnoreFile', () => { global.isWin = process.platform === 'win32' From 989255e95774f07e3700b54714ae104253ad6b91 Mon Sep 17 00:00:00 2001 From: Toni Barth Date: Sun, 8 Feb 2026 03:48:22 +0100 Subject: [PATCH 5/5] support isbn tags in dc:source --- server/utils/parsers/parseDaisyMetadata.js | 3 ++- .../server/utils/parsers/parseDaisyMetadata.test.js | 13 +++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/server/utils/parsers/parseDaisyMetadata.js b/server/utils/parsers/parseDaisyMetadata.js index decb06186..6ede05d36 100644 --- a/server/utils/parsers/parseDaisyMetadata.js +++ b/server/utils/parsers/parseDaisyMetadata.js @@ -65,7 +65,8 @@ function parseIdentifier(metaTags, identifierType) { const identifierValues = [ ...getValues(metaTags, 'dc:identifier'), - ...getValues(metaTags, 'ncc:identifier') + ...getValues(metaTags, 'ncc:identifier'), + ...(identifierType === 'isbn' ? getValues(metaTags, 'dc:source') : []) ] for (const identifier of identifierValues) { if (identifierType === 'isbn' && /isbn/i.test(identifier)) { diff --git a/test/server/utils/parsers/parseDaisyMetadata.test.js b/test/server/utils/parsers/parseDaisyMetadata.test.js index 2367a9555..8a18ce5e6 100644 --- a/test/server/utils/parsers/parseDaisyMetadata.test.js +++ b/test/server/utils/parsers/parseDaisyMetadata.test.js @@ -52,6 +52,19 @@ describe('parseDaisyMetadata', () => { expect(result.title).to.equal('Title From Head') }) + it('parses isbn from dc:source in DAISY ncc.html', () => { + const nccHtml = ` + + + + + + ` + + const result = parseDaisyMetadata(nccHtml) + expect(result.isbn).to.equal('978-0-553-38016-3') + }) + it('parses chapter names from heading entries in ncc.html', () => { const nccHtml = `