Try replacing HTML encoding sniffing with chardet to fix ncc.html files that declare an encoding but contain strings that do not actually use it

This commit is contained in:
Toni Barth 2026-02-07 19:32:30 +01:00
parent fac4415595
commit 3a1be51a83
3 changed files with 19 additions and 29 deletions

20
package-lock.json generated
View file

@ -10,12 +10,12 @@
"license": "GPL-3.0",
"dependencies": {
"axios": "^0.27.2",
"chardet": "^2.1.1",
"cookie-parser": "^1.4.6",
"express": "^4.17.1",
"express-rate-limit": "^7.5.1",
"express-session": "^1.17.3",
"graceful-fs": "^4.2.10",
"html-encoding-sniffer": "^4.0.0",
"htmlparser2": "^8.0.1",
"lru-cache": "^10.0.3",
"node-unrar-js": "^2.0.2",
@ -1255,6 +1255,12 @@
"node": ">=8"
}
},
"node_modules/chardet": {
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/chardet/-/chardet-2.1.1.tgz",
"integrity": "sha512-PsezH1rqdV9VvyNhxxOW32/d75r01NY7TQCmOqomRo15ZSOKbpTFVsfjghxo6JloQUCGnH4k1LGu0R4yCLlWQQ==",
"license": "MIT"
},
"node_modules/check-error": {
"version": "1.0.3",
"resolved": "https://registry.npmjs.org/check-error/-/check-error-1.0.3.tgz",
@ -2304,18 +2310,6 @@
"he": "bin/he"
}
},
"node_modules/html-encoding-sniffer": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/html-encoding-sniffer/-/html-encoding-sniffer-4.0.0.tgz",
"integrity": "sha512-Y22oTqIU4uuPgEemfz7NDJz6OeKf12Lsu+QC+s3BVpda64lTiMYCyGwg5ki4vFxkMwQdeZDl2adZoqUgdFuTgQ==",
"license": "MIT",
"dependencies": {
"whatwg-encoding": "^3.1.1"
},
"engines": {
"node": ">=18"
}
},
"node_modules/html-escaper": {
"version": "2.0.2",
"resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz",

View file

@ -39,12 +39,12 @@
"license": "GPL-3.0",
"dependencies": {
"axios": "^0.27.2",
"chardet": "^2.1.1",
"cookie-parser": "^1.4.6",
"express": "^4.17.1",
"express-rate-limit": "^7.5.1",
"express-session": "^1.17.3",
"graceful-fs": "^4.2.10",
"html-encoding-sniffer": "^4.0.0",
"htmlparser2": "^8.0.1",
"lru-cache": "^10.0.3",
"node-unrar-js": "^2.0.2",

View file

@ -6,7 +6,7 @@ const fs = require('../libs/fsExtra')
const rra = require('../libs/recursiveReaddirAsync')
const Logger = require('../Logger')
const { AudioMimeType } = require('./constants')
const sniffHTMLEncoding = require('html-encoding-sniffer')
const chardet = require('chardet')
const whatwgEncoding = require('whatwg-encoding')
/**
@ -119,22 +119,18 @@ module.exports.getIno = getIno
/**
* @typedef ReadTextFileOptions
* @property {boolean} [detectEncoding] detect text encoding before decoding
* @property {boolean} [isHtml] use HTML charset hints when detectEncoding is enabled
*/
function detectTextEncoding(buffer, options = {}) {
const { isHtml = false } = options
if (!isHtml) {
return 'UTF-8'
}
function detectTextEncoding(buffer) {
try {
const sniffedEncoding = sniffHTMLEncoding(buffer, { defaultEncoding: 'windows-1252' }) || 'windows-1252'
return whatwgEncoding.labelToName(sniffedEncoding) || 'UTF-8'
} catch {
return 'UTF-8'
}
const detectedEncoding = chardet.detect(buffer)
const labeledEncoding = detectedEncoding ? whatwgEncoding.labelToName(detectedEncoding) : null
if (labeledEncoding) {
return labeledEncoding
}
} catch {}
return 'UTF-8'
}
/**
@ -152,7 +148,7 @@ function decodeTextBuffer(buffer, options = {}) {
return String(buffer)
}
const fallbackEncoding = detectTextEncoding(buffer, { isHtml })
const fallbackEncoding = detectTextEncoding(buffer)
try {
// WHATWG decode handles BOM override and legacy encoding tables.
return whatwgEncoding.decode(buffer, fallbackEncoding)