From 3a1be51a830a7a725a32c0ae04de3090786e8722 Mon Sep 17 00:00:00 2001 From: Toni Barth Date: Sat, 7 Feb 2026 19:32:30 +0100 Subject: [PATCH] try to replace html sniffing with chardet to fix ncc.html files with set encoding but strings that ignore that --- package-lock.json | 20 +++++++------------- package.json | 2 +- server/utils/fileUtils.js | 26 +++++++++++--------------- 3 files changed, 19 insertions(+), 29 deletions(-) diff --git a/package-lock.json b/package-lock.json index 5fe3c5c74..5deb7ce2d 100644 --- a/package-lock.json +++ b/package-lock.json @@ -10,12 +10,12 @@ "license": "GPL-3.0", "dependencies": { "axios": "^0.27.2", + "chardet": "^2.1.1", "cookie-parser": "^1.4.6", "express": "^4.17.1", "express-rate-limit": "^7.5.1", "express-session": "^1.17.3", "graceful-fs": "^4.2.10", - "html-encoding-sniffer": "^4.0.0", "htmlparser2": "^8.0.1", "lru-cache": "^10.0.3", "node-unrar-js": "^2.0.2", @@ -1255,6 +1255,12 @@ "node": ">=8" } }, + "node_modules/chardet": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/chardet/-/chardet-2.1.1.tgz", + "integrity": "sha512-PsezH1rqdV9VvyNhxxOW32/d75r01NY7TQCmOqomRo15ZSOKbpTFVsfjghxo6JloQUCGnH4k1LGu0R4yCLlWQQ==", + "license": "MIT" + }, "node_modules/check-error": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/check-error/-/check-error-1.0.3.tgz", @@ -2304,18 +2310,6 @@ "he": "bin/he" } }, - "node_modules/html-encoding-sniffer": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/html-encoding-sniffer/-/html-encoding-sniffer-4.0.0.tgz", - "integrity": "sha512-Y22oTqIU4uuPgEemfz7NDJz6OeKf12Lsu+QC+s3BVpda64lTiMYCyGwg5ki4vFxkMwQdeZDl2adZoqUgdFuTgQ==", - "license": "MIT", - "dependencies": { - "whatwg-encoding": "^3.1.1" - }, - "engines": { - "node": ">=18" - } - }, "node_modules/html-escaper": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz", diff --git a/package.json b/package.json index b2de71f0d..44be24be1 100644 --- a/package.json +++ b/package.json @@ -39,12 +39,12 @@ "license": "GPL-3.0", "dependencies": { "axios": "^0.27.2", + "chardet": "^2.1.1", "cookie-parser": "^1.4.6", "express": "^4.17.1", "express-rate-limit": "^7.5.1", "express-session": "^1.17.3", "graceful-fs": "^4.2.10", - "html-encoding-sniffer": "^4.0.0", "htmlparser2": "^8.0.1", "lru-cache": "^10.0.3", "node-unrar-js": "^2.0.2", diff --git a/server/utils/fileUtils.js b/server/utils/fileUtils.js index 0f79ad09e..c55261cd9 100644 --- a/server/utils/fileUtils.js +++ b/server/utils/fileUtils.js @@ -6,7 +6,7 @@ const fs = require('../libs/fsExtra') const rra = require('../libs/recursiveReaddirAsync') const Logger = require('../Logger') const { AudioMimeType } = require('./constants') -const sniffHTMLEncoding = require('html-encoding-sniffer') +const chardet = require('chardet') const whatwgEncoding = require('whatwg-encoding') /** @@ -119,22 +119,18 @@ module.exports.getIno = getIno /** * @typedef ReadTextFileOptions - * @property {boolean} [detectEncoding] detect text encoding before decoding - * @property {boolean} [isHtml] use HTML charset hints when detectEncoding is enabled */ -function detectTextEncoding(buffer, options = {}) { - const { isHtml = false } = options - if (!isHtml) { - return 'UTF-8' - } - +function detectTextEncoding(buffer) { try { - const sniffedEncoding = sniffHTMLEncoding(buffer, { defaultEncoding: 'windows-1252' }) || 'windows-1252' - return whatwgEncoding.labelToName(sniffedEncoding) || 'UTF-8' - } catch { - return 'UTF-8' - } + const detectedEncoding = chardet.detect(buffer) + const labeledEncoding = detectedEncoding ? whatwgEncoding.labelToName(detectedEncoding) : null + if (labeledEncoding) { + return labeledEncoding + } + } catch {} + + return 'UTF-8' } /** @@ -152,7 +148,7 @@ function decodeTextBuffer(buffer, options = {}) { return String(buffer) } - const fallbackEncoding = detectTextEncoding(buffer, { isHtml }) + const fallbackEncoding = detectTextEncoding(buffer) try { // WHATWG decode handles BOM override and legacy encoding tables. return whatwgEncoding.decode(buffer, fallbackEncoding)