Try to properly interpret the encoding of ncc.html (it seems to be a bit weird / incorrect sometimes)

This commit is contained in:
Toni Barth 2026-02-07 18:00:29 +01:00
parent 6c9bf8c2bd
commit 687e62e1fa
5 changed files with 132 additions and 5 deletions

View file

@ -6,6 +6,26 @@ const fs = require('fs')
const Logger = require('../../../server/Logger')
describe('fileUtils', () => {
describe('decodeTextBuffer', () => {
  // Raw byte 0xfc is "ü" in windows-1252 and is never valid in UTF-8,
  // which makes it a convenient probe for the encoding handling under test.
  const U_UMLAUT_BYTE = Buffer.from([0xfc])

  // Builds an HTML buffer with the raw 0xfc byte spliced between two text parts.
  const buildHtmlBuffer = (beforeByte, afterByte) => {
    return Buffer.concat([Buffer.from(beforeByte), U_UMLAUT_BYTE, Buffer.from(afterByte)])
  }

  // Decodes a buffer as HTML with encoding detection enabled (fresh options object per call).
  const decodeAsHtml = (buffer) => {
    return fileUtils.decodeTextBuffer(buffer, { detectEncoding: true, isHtml: true })
  }

  it('decodes html using charset declaration (windows-1252)', () => {
    const rawHtml = buildHtmlBuffer('<html><head><meta charset="windows-1252"></head><body>M', 'ller</body></html>')

    const text = decodeAsHtml(rawHtml)

    expect(text).to.include('Müller')
  })

  it('falls back to windows-1252 for html without charset when utf-8 decoding is invalid', () => {
    const rawHtml = buildHtmlBuffer('<html><body>Gr', 'n</body></html>')

    const text = decodeAsHtml(rawHtml)

    expect(text).to.include('Grün')
  })
})
it('shouldIgnoreFile', () => {
global.isWin = process.platform === 'win32'