mirror of
https://github.com/advplyr/audiobookshelf.git
synced 2026-03-06 16:09:46 +00:00
try to properly interpret ncc.html encoding (seems to be a bit weird / incorrect sometimes)
This commit is contained in:
parent
6c9bf8c2bd
commit
687e62e1fa
5 changed files with 132 additions and 5 deletions
57
package-lock.json
generated
57
package-lock.json
generated
|
|
@ -15,6 +15,7 @@
|
||||||
"express-rate-limit": "^7.5.1",
|
"express-rate-limit": "^7.5.1",
|
||||||
"express-session": "^1.17.3",
|
"express-session": "^1.17.3",
|
||||||
"graceful-fs": "^4.2.10",
|
"graceful-fs": "^4.2.10",
|
||||||
|
"html-encoding-sniffer": "^4.0.0",
|
||||||
"htmlparser2": "^8.0.1",
|
"htmlparser2": "^8.0.1",
|
||||||
"lru-cache": "^10.0.3",
|
"lru-cache": "^10.0.3",
|
||||||
"node-unrar-js": "^2.0.2",
|
"node-unrar-js": "^2.0.2",
|
||||||
|
|
@ -28,6 +29,7 @@
|
||||||
"socket.io": "^4.5.4",
|
"socket.io": "^4.5.4",
|
||||||
"sqlite3": "^5.1.7",
|
"sqlite3": "^5.1.7",
|
||||||
"ssrf-req-filter": "^1.1.0",
|
"ssrf-req-filter": "^1.1.0",
|
||||||
|
"whatwg-encoding": "^3.1.1",
|
||||||
"xml2js": "^0.5.0"
|
"xml2js": "^0.5.0"
|
||||||
},
|
},
|
||||||
"bin": {
|
"bin": {
|
||||||
|
|
@ -122,6 +124,7 @@
|
||||||
"resolved": "https://registry.npmjs.org/@babel/core/-/core-7.23.3.tgz",
|
"resolved": "https://registry.npmjs.org/@babel/core/-/core-7.23.3.tgz",
|
||||||
"integrity": "sha512-Jg+msLuNuCJDyBvFv5+OKOUjWMZgd85bKjbICd3zWrKAo+bJ49HJufi7CQE0q0uR8NGyO6xkCACScNqyjHSZew==",
|
"integrity": "sha512-Jg+msLuNuCJDyBvFv5+OKOUjWMZgd85bKjbICd3zWrKAo+bJ49HJufi7CQE0q0uR8NGyO6xkCACScNqyjHSZew==",
|
||||||
"dev": true,
|
"dev": true,
|
||||||
|
"peer": true,
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@ampproject/remapping": "^2.2.0",
|
"@ampproject/remapping": "^2.2.0",
|
||||||
"@babel/code-frame": "^7.22.13",
|
"@babel/code-frame": "^7.22.13",
|
||||||
|
|
@ -1049,6 +1052,7 @@
|
||||||
"url": "https://github.com/sponsors/ai"
|
"url": "https://github.com/sponsors/ai"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
"peer": true,
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"caniuse-lite": "^1.0.30001541",
|
"caniuse-lite": "^1.0.30001541",
|
||||||
"electron-to-chromium": "^1.4.535",
|
"electron-to-chromium": "^1.4.535",
|
||||||
|
|
@ -1857,6 +1861,7 @@
|
||||||
"version": "4.18.2",
|
"version": "4.18.2",
|
||||||
"resolved": "https://registry.npmjs.org/express/-/express-4.18.2.tgz",
|
"resolved": "https://registry.npmjs.org/express/-/express-4.18.2.tgz",
|
||||||
"integrity": "sha512-5/PsL6iGPdfQ/lKM1UuielYgv3BUoJfz1aUwU9vHZ+J7gyvwdQXFEBIEIaxeGf0GIcreATNyBExtalisDbuMqQ==",
|
"integrity": "sha512-5/PsL6iGPdfQ/lKM1UuielYgv3BUoJfz1aUwU9vHZ+J7gyvwdQXFEBIEIaxeGf0GIcreATNyBExtalisDbuMqQ==",
|
||||||
|
"peer": true,
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"accepts": "~1.3.8",
|
"accepts": "~1.3.8",
|
||||||
"array-flatten": "1.1.1",
|
"array-flatten": "1.1.1",
|
||||||
|
|
@ -2113,6 +2118,21 @@
|
||||||
"integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==",
|
"integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==",
|
||||||
"devOptional": true
|
"devOptional": true
|
||||||
},
|
},
|
||||||
|
"node_modules/fsevents": {
|
||||||
|
"version": "2.3.3",
|
||||||
|
"resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz",
|
||||||
|
"integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==",
|
||||||
|
"dev": true,
|
||||||
|
"hasInstallScript": true,
|
||||||
|
"license": "MIT",
|
||||||
|
"optional": true,
|
||||||
|
"os": [
|
||||||
|
"darwin"
|
||||||
|
],
|
||||||
|
"engines": {
|
||||||
|
"node": "^8.16.0 || ^10.6.0 || >=11.0.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/function-bind": {
|
"node_modules/function-bind": {
|
||||||
"version": "1.1.2",
|
"version": "1.1.2",
|
||||||
"resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz",
|
"resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz",
|
||||||
|
|
@ -2284,6 +2304,18 @@
|
||||||
"he": "bin/he"
|
"he": "bin/he"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/html-encoding-sniffer": {
|
||||||
|
"version": "4.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/html-encoding-sniffer/-/html-encoding-sniffer-4.0.0.tgz",
|
||||||
|
"integrity": "sha512-Y22oTqIU4uuPgEemfz7NDJz6OeKf12Lsu+QC+s3BVpda64lTiMYCyGwg5ki4vFxkMwQdeZDl2adZoqUgdFuTgQ==",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"whatwg-encoding": "^3.1.1"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/html-escaper": {
|
"node_modules/html-escaper": {
|
||||||
"version": "2.0.2",
|
"version": "2.0.2",
|
||||||
"resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz",
|
"resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz",
|
||||||
|
|
@ -5358,6 +5390,31 @@
|
||||||
"node": ">= 0.8"
|
"node": ">= 0.8"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/whatwg-encoding": {
|
||||||
|
"version": "3.1.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz",
|
||||||
|
"integrity": "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==",
|
||||||
|
"deprecated": "Use @exodus/bytes instead for a more spec-conformant and faster implementation",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"iconv-lite": "0.6.3"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/whatwg-encoding/node_modules/iconv-lite": {
|
||||||
|
"version": "0.6.3",
|
||||||
|
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz",
|
||||||
|
"integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"safer-buffer": ">= 2.1.2 < 3.0.0"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=0.10.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/which": {
|
"node_modules/which": {
|
||||||
"version": "2.0.2",
|
"version": "2.0.2",
|
||||||
"resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz",
|
"resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz",
|
||||||
|
|
|
||||||
|
|
@ -44,6 +44,7 @@
|
||||||
"express-rate-limit": "^7.5.1",
|
"express-rate-limit": "^7.5.1",
|
||||||
"express-session": "^1.17.3",
|
"express-session": "^1.17.3",
|
||||||
"graceful-fs": "^4.2.10",
|
"graceful-fs": "^4.2.10",
|
||||||
|
"html-encoding-sniffer": "^4.0.0",
|
||||||
"htmlparser2": "^8.0.1",
|
"htmlparser2": "^8.0.1",
|
||||||
"lru-cache": "^10.0.3",
|
"lru-cache": "^10.0.3",
|
||||||
"node-unrar-js": "^2.0.2",
|
"node-unrar-js": "^2.0.2",
|
||||||
|
|
@ -57,6 +58,7 @@
|
||||||
"socket.io": "^4.5.4",
|
"socket.io": "^4.5.4",
|
||||||
"sqlite3": "^5.1.7",
|
"sqlite3": "^5.1.7",
|
||||||
"ssrf-req-filter": "^1.1.0",
|
"ssrf-req-filter": "^1.1.0",
|
||||||
|
"whatwg-encoding": "^3.1.1",
|
||||||
"xml2js": "^0.5.0"
|
"xml2js": "^0.5.0"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
|
|
|
||||||
|
|
@ -12,7 +12,7 @@ class DaisyFileScanner {
|
||||||
* @param {Object} bookMetadata
|
* @param {Object} bookMetadata
|
||||||
*/
|
*/
|
||||||
async scanBookDaisyFile(daisyLibraryFileObj, bookMetadata, audioFiles = []) {
|
async scanBookDaisyFile(daisyLibraryFileObj, bookMetadata, audioFiles = []) {
|
||||||
const htmlText = await readTextFile(daisyLibraryFileObj.metadata.path)
|
const htmlText = await readTextFile(daisyLibraryFileObj.metadata.path, { detectEncoding: true, isHtml: true })
|
||||||
const daisyMetadata = htmlText ? parseDaisyMetadata(htmlText) : null
|
const daisyMetadata = htmlText ? parseDaisyMetadata(htmlText) : null
|
||||||
if (daisyMetadata) {
|
if (daisyMetadata) {
|
||||||
for (const key in daisyMetadata) {
|
for (const key in daisyMetadata) {
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,8 @@ const fs = require('../libs/fsExtra')
|
||||||
const rra = require('../libs/recursiveReaddirAsync')
|
const rra = require('../libs/recursiveReaddirAsync')
|
||||||
const Logger = require('../Logger')
|
const Logger = require('../Logger')
|
||||||
const { AudioMimeType } = require('./constants')
|
const { AudioMimeType } = require('./constants')
|
||||||
|
const sniffHTMLEncoding = require('html-encoding-sniffer')
|
||||||
|
const whatwgEncoding = require('whatwg-encoding')
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Make sure folder separator is POSIX for Windows file paths. e.g. "C:\Users\Abs" becomes "C:/Users/Abs"
|
* Make sure folder separator is POSIX for Windows file paths. e.g. "C:\Users\Abs" becomes "C:/Users/Abs"
|
||||||
|
|
@ -116,14 +118,60 @@ function getIno(path) {
|
||||||
module.exports.getIno = getIno
|
module.exports.getIno = getIno
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Read contents of file
|
* @typedef ReadTextFileOptions
|
||||||
* @param {string} path
|
* @property {boolean} [detectEncoding] detect text encoding before decoding
|
||||||
|
* @property {boolean} [isHtml] use HTML charset hints when detectEncoding is enabled
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
 * Pick the encoding name used as the decode fallback for a raw text buffer.
 *
 * Non-HTML content is always reported as UTF-8. For HTML the buffer is handed
 * to html-encoding-sniffer (with windows-1252 as its default) and the result
 * is normalized through whatwg-encoding's labelToName.
 *
 * @param {Buffer} buffer raw file bytes
 * @param {{ isHtml?: boolean }} [options]
 * @returns {string} encoding name, 'UTF-8' when nothing usable was found
 */
function detectTextEncoding(buffer, options = {}) {
  // Only HTML input is sniffed; everything else is reported as UTF-8.
  if (!options.isHtml) return 'UTF-8'

  let encodingName = null
  try {
    const sniffedLabel = sniffHTMLEncoding(buffer, { defaultEncoding: 'windows-1252' }) || 'windows-1252'
    // labelToName yields a falsy value for labels it does not recognize.
    encodingName = whatwgEncoding.labelToName(sniffedLabel)
  } catch {
    // Any failure while sniffing/normalizing ends up on the UTF-8 fallback below.
    encodingName = null
  }

  return encodingName || 'UTF-8'
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Decode raw text bytes with optional encoding detection.
|
||||||
|
*
|
||||||
|
* @param {Buffer} buffer
|
||||||
|
* @param {ReadTextFileOptions} [options]
|
||||||
* @returns {string}
|
* @returns {string}
|
||||||
*/
|
*/
|
||||||
async function readTextFile(path) {
|
function decodeTextBuffer(buffer, options = {}) {
  // Nothing to decode (null / undefined buffer).
  if (!buffer) return ''
  const { detectEncoding = false, isHtml = false } = options

  // Default behavior: interpret the bytes as UTF-8 without any detection.
  if (!detectEncoding) {
    return String(buffer)
  }

  // Try a strict UTF-8 decode first. Bytes that validate as UTF-8 are kept as
  // UTF-8 even when the document has no (or a wrong) charset declaration.
  // Without this step, HTML lacking a BOM / <meta charset> would always be
  // decoded with the sniffer's windows-1252 default and valid UTF-8 text
  // would come out as mojibake.
  try {
    // fatal: true makes decode() throw on any invalid byte sequence instead
    // of inserting U+FFFD; a leading UTF-8 BOM is stripped.
    return new TextDecoder('utf-8', { fatal: true }).decode(buffer)
  } catch {
    // Not valid UTF-8 (or strict decoding unavailable): fall through to the
    // sniffed legacy encoding below.
  }

  const fallbackEncoding = detectTextEncoding(buffer, { isHtml })
  try {
    // WHATWG decode handles BOM override and legacy encoding tables.
    return whatwgEncoding.decode(buffer, fallbackEncoding)
  } catch {
    // Last resort: same result as the non-detecting path.
    return String(buffer)
  }
}
|
||||||
|
module.exports.decodeTextBuffer = decodeTextBuffer
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read contents of file
|
||||||
|
* @param {string} path
|
||||||
|
* @param {ReadTextFileOptions} [options]
|
||||||
|
* @returns {string}
|
||||||
|
*/
|
||||||
|
async function readTextFile(path, options = {}) {
|
||||||
try {
|
try {
|
||||||
var data = await fs.readFile(path)
|
var data = await fs.readFile(path)
|
||||||
return String(data)
|
return decodeTextBuffer(data, options)
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.error(`[FileUtils] ReadTextFile error ${error}`)
|
Logger.error(`[FileUtils] ReadTextFile error ${error}`)
|
||||||
return ''
|
return ''
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,26 @@ const fs = require('fs')
|
||||||
const Logger = require('../../../server/Logger')
|
const Logger = require('../../../server/Logger')
|
||||||
|
|
||||||
describe('fileUtils', () => {
|
describe('fileUtils', () => {
|
||||||
|
describe('decodeTextBuffer', () => {
|
||||||
|
it('decodes html using charset declaration (windows-1252)', () => {
|
||||||
|
const htmlPrefix = Buffer.from('<html><head><meta charset="windows-1252"></head><body>M')
|
||||||
|
const htmlSuffix = Buffer.from('ller</body></html>')
|
||||||
|
const input = Buffer.concat([htmlPrefix, Buffer.from([0xfc]), htmlSuffix])
|
||||||
|
|
||||||
|
const decoded = fileUtils.decodeTextBuffer(input, { detectEncoding: true, isHtml: true })
|
||||||
|
expect(decoded).to.include('Müller')
|
||||||
|
})
|
||||||
|
|
||||||
|
it('falls back to windows-1252 for html without charset when utf-8 decoding is invalid', () => {
|
||||||
|
const htmlPrefix = Buffer.from('<html><body>Gr')
|
||||||
|
const htmlSuffix = Buffer.from('n</body></html>')
|
||||||
|
const input = Buffer.concat([htmlPrefix, Buffer.from([0xfc]), htmlSuffix])
|
||||||
|
|
||||||
|
const decoded = fileUtils.decodeTextBuffer(input, { detectEncoding: true, isHtml: true })
|
||||||
|
expect(decoded).to.include('Grün')
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
it('shouldIgnoreFile', () => {
|
it('shouldIgnoreFile', () => {
|
||||||
global.isWin = process.platform === 'win32'
|
global.isWin = process.platform === 'win32'
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue