From fac441559584cfc2f65baadf3475138e7a6017c8 Mon Sep 17 00:00:00 2001 From: Toni Barth Date: Sat, 7 Feb 2026 18:00:29 +0100 Subject: [PATCH] try to properly interpret ncc.html encoding (seems to be a bit weird / incorrect sometimes) --- package-lock.json | 57 +++++++++++++++++++++++++++++ package.json | 2 + server/scanner/DaisyFileScanner.js | 2 +- server/utils/fileUtils.js | 56 ++++++++++++++++++++++++++-- test/server/utils/fileUtils.test.js | 20 ++++++++++ 5 files changed, 132 insertions(+), 5 deletions(-) diff --git a/package-lock.json b/package-lock.json index 08707893d..5fe3c5c74 100644 --- a/package-lock.json +++ b/package-lock.json @@ -15,6 +15,7 @@ "express-rate-limit": "^7.5.1", "express-session": "^1.17.3", "graceful-fs": "^4.2.10", + "html-encoding-sniffer": "^4.0.0", "htmlparser2": "^8.0.1", "lru-cache": "^10.0.3", "node-unrar-js": "^2.0.2", @@ -28,6 +29,7 @@ "socket.io": "^4.5.4", "sqlite3": "^5.1.7", "ssrf-req-filter": "^1.1.0", + "whatwg-encoding": "^3.1.1", "xml2js": "^0.5.0" }, "bin": { @@ -122,6 +124,7 @@ "resolved": "https://registry.npmjs.org/@babel/core/-/core-7.23.3.tgz", "integrity": "sha512-Jg+msLuNuCJDyBvFv5+OKOUjWMZgd85bKjbICd3zWrKAo+bJ49HJufi7CQE0q0uR8NGyO6xkCACScNqyjHSZew==", "dev": true, + "peer": true, "dependencies": { "@ampproject/remapping": "^2.2.0", "@babel/code-frame": "^7.22.13", @@ -1049,6 +1052,7 @@ "url": "https://github.com/sponsors/ai" } ], + "peer": true, "dependencies": { "caniuse-lite": "^1.0.30001541", "electron-to-chromium": "^1.4.535", @@ -1857,6 +1861,7 @@ "version": "4.18.2", "resolved": "https://registry.npmjs.org/express/-/express-4.18.2.tgz", "integrity": "sha512-5/PsL6iGPdfQ/lKM1UuielYgv3BUoJfz1aUwU9vHZ+J7gyvwdQXFEBIEIaxeGf0GIcreATNyBExtalisDbuMqQ==", + "peer": true, "dependencies": { "accepts": "~1.3.8", "array-flatten": "1.1.1", @@ -2113,6 +2118,21 @@ "integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==", "devOptional": true }, + "node_modules/fsevents": { + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", + "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, "node_modules/function-bind": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", @@ -2284,6 +2304,18 @@ "he": "bin/he" } }, + "node_modules/html-encoding-sniffer": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/html-encoding-sniffer/-/html-encoding-sniffer-4.0.0.tgz", + "integrity": "sha512-Y22oTqIU4uuPgEemfz7NDJz6OeKf12Lsu+QC+s3BVpda64lTiMYCyGwg5ki4vFxkMwQdeZDl2adZoqUgdFuTgQ==", + "license": "MIT", + "dependencies": { + "whatwg-encoding": "^3.1.1" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/html-escaper": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz", @@ -5358,6 +5390,31 @@ "node": ">= 0.8" } }, + "node_modules/whatwg-encoding": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz", + "integrity": "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==", + "deprecated": "Use @exodus/bytes instead for a more spec-conformant and faster implementation", + "license": "MIT", + "dependencies": { + "iconv-lite": "0.6.3" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/whatwg-encoding/node_modules/iconv-lite": { + "version": "0.6.3", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz", + "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==", + "license": "MIT", + "dependencies": { + "safer-buffer": ">= 2.1.2 < 3.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/which": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz", diff --git a/package.json b/package.json index 3ee3fb391..b2de71f0d 100644 --- a/package.json +++ b/package.json @@ -44,6 +44,7 @@ "express-rate-limit": "^7.5.1", "express-session": "^1.17.3", "graceful-fs": "^4.2.10", + "html-encoding-sniffer": "^4.0.0", "htmlparser2": "^8.0.1", "lru-cache": "^10.0.3", "node-unrar-js": "^2.0.2", @@ -57,6 +58,7 @@ "socket.io": "^4.5.4", "sqlite3": "^5.1.7", "ssrf-req-filter": "^1.1.0", + "whatwg-encoding": "^3.1.1", "xml2js": "^0.5.0" }, "devDependencies": { diff --git a/server/scanner/DaisyFileScanner.js b/server/scanner/DaisyFileScanner.js index 217709063..e0b7cd84f 100644 --- a/server/scanner/DaisyFileScanner.js +++ b/server/scanner/DaisyFileScanner.js @@ -12,7 +12,7 @@ class DaisyFileScanner { * @param {Object} bookMetadata */ async scanBookDaisyFile(daisyLibraryFileObj, bookMetadata, audioFiles = []) { - const htmlText = await readTextFile(daisyLibraryFileObj.metadata.path) + const htmlText = await readTextFile(daisyLibraryFileObj.metadata.path, { detectEncoding: true, isHtml: true }) const daisyMetadata = htmlText ? parseDaisyMetadata(htmlText) : null if (daisyMetadata) { for (const key in daisyMetadata) { diff --git a/server/utils/fileUtils.js b/server/utils/fileUtils.js index 9a349bd54..0f79ad09e 100644 --- a/server/utils/fileUtils.js +++ b/server/utils/fileUtils.js @@ -6,6 +6,8 @@ const fs = require('../libs/fsExtra') const rra = require('../libs/recursiveReaddirAsync') const Logger = require('../Logger') const { AudioMimeType } = require('./constants') +const sniffHTMLEncoding = require('html-encoding-sniffer') +const whatwgEncoding = require('whatwg-encoding') /** * Make sure folder separator is POSIX for Windows file paths. e.g. "C:\Users\Abs" becomes "C:/Users/Abs" @@ -116,14 +118,60 @@ function getIno(path) { module.exports.getIno = getIno /** - * Read contents of file - * @param {string} path + * @typedef ReadTextFileOptions + * @property {boolean} [detectEncoding] detect text encoding before decoding + * @property {boolean} [isHtml] use HTML charset hints when detectEncoding is enabled + */ + +function detectTextEncoding(buffer, options = {}) { + const { isHtml = false } = options + if (!isHtml) { + return 'UTF-8' + } + + try { + const sniffedEncoding = sniffHTMLEncoding(buffer, { defaultEncoding: 'windows-1252' }) || 'windows-1252' + return whatwgEncoding.labelToName(sniffedEncoding) || 'UTF-8' + } catch { + return 'UTF-8' + } +} + +/** + * Decode raw text bytes with optional encoding detection. + * + * @param {Buffer} buffer + * @param {ReadTextFileOptions} [options] * @returns {string} */ -async function readTextFile(path) { +function decodeTextBuffer(buffer, options = {}) { + if (!buffer) return '' + const { detectEncoding = false, isHtml = false } = options + + if (!detectEncoding) { + return String(buffer) + } + + const fallbackEncoding = detectTextEncoding(buffer, { isHtml }) + try { + // WHATWG decode handles BOM override and legacy encoding tables. + return whatwgEncoding.decode(buffer, fallbackEncoding) + } catch { + return String(buffer) + } +} +module.exports.decodeTextBuffer = decodeTextBuffer + +/** + * Read contents of file + * @param {string} path + * @param {ReadTextFileOptions} [options] + * @returns {string} + */ +async function readTextFile(path, options = {}) { try { var data = await fs.readFile(path) - return String(data) + return decodeTextBuffer(data, options) } catch (error) { Logger.error(`[FileUtils] ReadTextFile error ${error}`) return '' diff --git a/test/server/utils/fileUtils.test.js b/test/server/utils/fileUtils.test.js index b57a6fb86..a0482f93b 100644 --- a/test/server/utils/fileUtils.test.js +++ b/test/server/utils/fileUtils.test.js @@ -6,6 +6,26 @@ const fs = require('fs') const Logger = require('../../../server/Logger') describe('fileUtils', () => { + describe('decodeTextBuffer', () => { + it('decodes html using charset declaration (windows-1252)', () => { + const htmlPrefix = Buffer.from('M') + const htmlSuffix = Buffer.from('ller') + const input = Buffer.concat([htmlPrefix, Buffer.from([0xfc]), htmlSuffix]) + + const decoded = fileUtils.decodeTextBuffer(input, { detectEncoding: true, isHtml: true }) + expect(decoded).to.include('Müller') + }) + + it('falls back to windows-1252 for html without charset when utf-8 decoding is invalid', () => { + const htmlPrefix = Buffer.from('Gr') + const htmlSuffix = Buffer.from('n') + const input = Buffer.concat([htmlPrefix, Buffer.from([0xfc]), htmlSuffix]) + + const decoded = fileUtils.decodeTextBuffer(input, { detectEncoding: true, isHtml: true }) + expect(decoded).to.include('Grün') + }) + }) + it('shouldIgnoreFile', () => { global.isWin = process.platform === 'win32'