mirror of
https://github.com/advplyr/audiobookshelf.git
synced 2026-03-01 05:29:41 +00:00
first iteration of parsing metadata and chapter names from ncc.html file
This commit is contained in:
parent
9defe67fe9
commit
f157e63fd7
10 changed files with 394 additions and 6 deletions
|
|
@ -127,7 +127,7 @@ export default {
|
|||
autoScanCronExpression: null,
|
||||
hideSingleBookSeries: false,
|
||||
onlyShowLaterBooksInContinueSeries: false,
|
||||
metadataPrecedence: ['folderStructure', 'audioMetatags', 'nfoFile', 'txtFiles', 'opfFile', 'absMetadata'],
|
||||
metadataPrecedence: ['folderStructure', 'audioMetatags', 'nfoFile', 'txtFiles', 'opfFile', 'daisyFile', 'absMetadata'],
|
||||
markAsFinishedPercentComplete: null,
|
||||
markAsFinishedTimeRemaining: 10
|
||||
}
|
||||
|
|
|
|||
|
|
@ -81,6 +81,11 @@ export default {
|
|||
name: 'OPF file',
|
||||
include: true
|
||||
},
|
||||
daisyFile: {
|
||||
id: 'daisyFile',
|
||||
name: 'DAISY ncc.html file',
|
||||
include: true
|
||||
},
|
||||
absMetadata: {
|
||||
id: 'absMetadata',
|
||||
name: 'Audiobookshelf metadata file',
|
||||
|
|
@ -157,4 +162,4 @@ export default {
|
|||
this.init()
|
||||
}
|
||||
}
|
||||
</script>
|
||||
</script>
|
||||
|
|
|
|||
|
|
@ -82,7 +82,7 @@ class Library extends Model {
|
|||
}
|
||||
|
||||
static get defaultMetadataPrecedence() {
|
||||
return ['folderStructure', 'audioMetatags', 'nfoFile', 'txtFiles', 'opfFile', 'absMetadata']
|
||||
return ['folderStructure', 'audioMetatags', 'nfoFile', 'txtFiles', 'opfFile', 'daisyFile', 'absMetadata']
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@ const CoverManager = require('../managers/CoverManager')
|
|||
const LibraryScan = require('./LibraryScan')
|
||||
const OpfFileScanner = require('./OpfFileScanner')
|
||||
const NfoFileScanner = require('./NfoFileScanner')
|
||||
const DaisyFileScanner = require('./DaisyFileScanner')
|
||||
const AbsMetadataFileScanner = require('./AbsMetadataFileScanner')
|
||||
|
||||
/**
|
||||
|
|
@ -792,6 +793,14 @@ class BookScanner {
|
|||
await OpfFileScanner.scanBookOpfFile(this.libraryItemData.metadataOpfLibraryFile, this.bookMetadata)
|
||||
}
|
||||
|
||||
/**
|
||||
* Metadata from DAISY ncc.html file
|
||||
*/
|
||||
async daisyFile() {
|
||||
if (!this.libraryItemData.metadataDaisyNccLibraryFile) return
|
||||
await DaisyFileScanner.scanBookDaisyFile(this.libraryItemData.metadataDaisyNccLibraryFile, this.bookMetadata, this.audioFiles)
|
||||
}
|
||||
|
||||
/**
|
||||
* Metadata from metadata.json
|
||||
*/
|
||||
|
|
|
|||
99
server/scanner/DaisyFileScanner.js
Normal file
99
server/scanner/DaisyFileScanner.js
Normal file
|
|
@ -0,0 +1,99 @@
|
|||
const { parseDaisyMetadata } = require('../utils/parsers/parseDaisyMetadata')
|
||||
const { readTextFile } = require('../utils/fileUtils')
|
||||
const Path = require('path')
|
||||
|
||||
class DaisyFileScanner {
  constructor() {}

  /**
   * Parse metadata from DAISY ncc.html file found in library scan and update bookMetadata
   *
   * List fields (tags, genres, authors, narrators) replace the existing value only when the
   * ncc.html yields at least one entry. Chapter titles are merged by index into existing
   * chapters, or combined with audio file durations when bookMetadata has no chapters yet.
   * Every other field is copied when truthy.
   *
   * @param {import('../models/LibraryItem').LibraryFileObject} daisyLibraryFileObj
   * @param {Object} bookMetadata - mutated in place
   * @param {import('../models/Book').AudioFileObject[]} [audioFiles] - source of chapter timings when bookMetadata has no chapters
   * @returns {Promise<void>}
   */
  async scanBookDaisyFile(daisyLibraryFileObj, bookMetadata, audioFiles = []) {
    const htmlText = await readTextFile(daisyLibraryFileObj.metadata.path)
    const daisyMetadata = htmlText ? parseDaisyMetadata(htmlText) : null
    if (!daisyMetadata) return

    for (const key in daisyMetadata) {
      const value = daisyMetadata[key]

      if (key === 'tags' || key === 'genres' || key === 'authors' || key === 'narrators') {
        // Guarded uniformly so a missing/null list is skipped instead of throwing
        if (value?.length) {
          bookMetadata[key] = value
        }
      } else if (key === 'chapters') {
        if (!value?.length) continue

        // DAISY ncc.html provides chapter names; preserve existing timings if available.
        if (bookMetadata.chapters?.length) {
          bookMetadata.chapters = bookMetadata.chapters.map((chapter, index) => {
            const daisyTitle = value[index]?.title
            if (!daisyTitle) return chapter
            return {
              ...chapter,
              id: chapter.id ?? index,
              title: daisyTitle
            }
          })
        } else {
          const chaptersFromFiles = this.buildChaptersFromAudioFiles(audioFiles, value)
          if (chaptersFromFiles.length) {
            bookMetadata.chapters = chaptersFromFiles
          }
        }
      } else if (value) {
        bookMetadata[key] = value
      }
    }
  }

  /**
   * Build chapter timings from ordered audio files while applying DAISY chapter titles.
   * Falls back to file basenames if DAISY has fewer titles than files.
   *
   * Files without a duration are skipped and do not consume a DAISY title: titles are
   * matched against the index of the emitted chapter, not the index of the file.
   *
   * @param {import('../models/Book').AudioFileObject[]} audioFiles
   * @param {{title:string}[]} [daisyChapters] - may be omitted or null; basenames are used then
   * @returns {import('../models/Book').ChapterObject[]}
   */
  buildChaptersFromAudioFiles(audioFiles, daisyChapters = []) {
    if (!audioFiles?.length) return []

    const chapters = []
    let currentStartTime = 0

    for (const audioFile of audioFiles) {
      // A file with no known duration cannot be placed on the timeline
      if (!audioFile?.duration) continue

      const chapterId = chapters.length
      const filename = audioFile.metadata?.filename
      const fallbackTitle = filename ? Path.basename(filename, Path.extname(filename)) : `Chapter ${chapterId + 1}`
      const end = currentStartTime + audioFile.duration

      chapters.push({
        id: chapterId,
        start: currentStartTime,
        end,
        title: daisyChapters?.[chapterId]?.title || fallbackTitle
      })

      currentStartTime = end
    }

    return chapters
  }
}
|
||||
module.exports = new DaisyFileScanner()
|
||||
|
|
@ -173,6 +173,11 @@ class LibraryItemScanData {
|
|||
return this.libraryFiles.find(lf => lf.metadata.ext.toLowerCase() === '.nfo')
|
||||
}
|
||||
|
||||
/** @type {LibraryItem.LibraryFileObject} */
|
||||
get metadataDaisyNccLibraryFile() {
|
||||
return this.libraryFiles.find(lf => lf.metadata.filename?.toLowerCase() === 'ncc.html')
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {LibraryItem} existingLibraryItem
|
||||
|
|
@ -374,4 +379,4 @@ class LibraryItemScanData {
|
|||
}
|
||||
}
|
||||
}
|
||||
module.exports = LibraryItemScanData
|
||||
module.exports = LibraryItemScanData
|
||||
|
|
|
|||
176
server/utils/parsers/parseDaisyMetadata.js
Normal file
176
server/utils/parsers/parseDaisyMetadata.js
Normal file
|
|
@ -0,0 +1,176 @@
|
|||
const h = require('htmlparser2')
|
||||
const parseNameString = require('./parseNameString')
|
||||
|
||||
/**
 * Return the non-empty values collected for a meta tag name.
 *
 * @param {Object<string, string[]>} metaTags - map of meta name to collected content values
 * @param {string} tagName - meta name to look up
 * @returns {string[]} truthy values in their stored order, or an empty array
 */
function getValues(metaTags, tagName) {
  // Only own properties count: on a plain-object map an inherited member such as
  // `constructor` is not an array and would make `.filter` throw
  if (!metaTags || !Object.prototype.hasOwnProperty.call(metaTags, tagName)) return []
  const values = metaTags[tagName]
  return Array.isArray(values) ? values.filter((v) => v) : []
}
|
||||
|
||||
/**
 * Return the first non-empty value of the first tag name that has any.
 *
 * @param {Object<string, string[]>} metaTags
 * @param {string[]} tagNames - tag names in priority order
 * @returns {string|null} null when none of the tag names has a value
 */
function getFirstValue(metaTags, tagNames) {
  const valuesPerTag = Array.from(tagNames, (tagName) => getValues(metaTags, tagName))
  const firstPopulated = valuesPerTag.find((values) => values.length)
  return firstPopulated ? firstPopulated[0] : null
}
|
||||
|
||||
/**
 * Split raw name strings into individual names, de-duplicated in first-seen order.
 * Uses parseNameString when it yields names, otherwise splits the value on semicolons.
 *
 * @param {string[]} values
 * @returns {string[]}
 */
function parseNameValues(values) {
  const uniqueNames = new Set()
  for (const rawValue of values) {
    const parsed = parseNameString.parse(rawValue)
    const candidates = parsed?.names || rawValue.split(/\s*;\s*/).filter((n) => n)
    for (const candidate of candidates) {
      uniqueNames.add(candidate)
    }
  }
  return [...uniqueNames]
}
|
||||
|
||||
/**
 * Split comma/semicolon separated strings into a flat list of unique, trimmed items.
 *
 * @param {string[]} values - raw strings, each possibly holding several items
 * @returns {string[]} items in first-seen order, without empty entries
 */
function parseStringList(values) {
  // A Set preserves insertion order and avoids rescanning the list for every item
  const items = new Set()
  for (const value of values || []) {
    if (typeof value !== 'string') continue
    for (const part of value.split(/[;,]/)) {
      // Trim each part so whitespace at the edges of the whole value cannot
      // produce near-duplicates such as ' Fiction' next to 'Fiction'
      const item = part.trim()
      if (item) items.add(item)
    }
  }
  return [...items]
}
|
||||
|
||||
/**
 * Pull the first run of four consecutive digits out of a string.
 *
 * @param {string} str
 * @returns {string|null} the four digits, or null for empty input or no match
 */
function extractYear(str) {
  const yearMatch = str ? str.match(/\d{4}/) : null
  return yearMatch?.[0] ?? null
}
|
||||
|
||||
/**
 * Extract an ISBN or ASIN from free-form identifier text.
 *
 * Every delimited candidate in the text is examined in order and the first one that
 * passes validation is returned, with its original separators kept:
 * - isbn: after removing hyphens/spaces the candidate must be 13 digits, or 9 digits
 *   followed by a digit or X. This rejects dates and stray separator runs.
 * - any other type (asin): 10 alphanumeric characters containing at least one digit,
 *   so that ordinary 10-letter words are not mistaken for an ASIN.
 *
 * @param {string} identifier - raw identifier text, e.g. "ISBN 978-1-4028-9462-6"
 * @param {string} identifierType - 'isbn' selects ISBN matching, anything else ASIN matching
 * @returns {string|null} the extracted identifier, or null when none is found
 */
function extractIdentifierValue(identifier, identifierType) {
  if (typeof identifier !== 'string') return null

  const value = identifier.trim()
  if (!value) return null

  const isIsbn = identifierType === 'isbn'
  // Lookarounds (rather than consumed delimiters) let matchAll find adjacent candidates
  const expression = isIsbn
    ? /(?<![a-z0-9])(97[89][\d\- ]{9,16}[\dx]|[\d\- ]{9,14}[\dx])(?![a-z0-9])/gi
    : /(?<![a-z0-9])([a-z0-9]{10})(?![a-z0-9])/gi

  for (const match of value.matchAll(expression)) {
    const candidate = match[1].replace(/^[^a-z0-9]+|[^a-z0-9]+$/gi, '').trim()
    if (isIsbn) {
      const compact = candidate.replace(/[\- ]/g, '')
      if (/^(?:\d{9}[\dx]|\d{13})$/i.test(compact)) return candidate
    } else if (/\d/.test(candidate)) {
      return candidate
    }
  }
  return null
}
|
||||
|
||||
/**
 * Resolve an ISBN or ASIN from the collected meta tags.
 *
 * Lookup order:
 *  1. the type-specific tag (`dc:identifier:isbn` / `dc:identifier:asin`), then a tag named after the type
 *  2. generic `dc:identifier` / `ncc:identifier` values that mention the type by name
 *  3. any generic identifier value from which an identifier can be extracted
 *
 * @param {Object<string, string[]>} metaTags
 * @param {string} identifierType - 'isbn' or 'asin'
 * @returns {string|null}
 */
function parseIdentifier(metaTags, identifierType) {
  const isIsbn = identifierType === 'isbn'
  const extract = (rawIdentifier) => extractIdentifierValue(rawIdentifier, identifierType)

  const typedTagName = isIsbn ? 'dc:identifier:isbn' : 'dc:identifier:asin'
  const typedIdentifier = getFirstValue(metaTags, [typedTagName, identifierType])
  const fromTypedTag = typedIdentifier ? extract(typedIdentifier) : null
  if (fromTypedTag) return fromTypedTag

  const genericIdentifiers = getValues(metaTags, 'dc:identifier').concat(getValues(metaTags, 'ncc:identifier'))

  // Values that name the scheme are tried before the unlabelled fallback pass
  let labelPattern = null
  if (isIsbn) labelPattern = /isbn/i
  else if (identifierType === 'asin') labelPattern = /asin/i
  const labelledIdentifiers = labelPattern ? genericIdentifiers.filter((candidate) => labelPattern.test(candidate)) : []

  for (const candidate of [...labelledIdentifiers, ...genericIdentifiers]) {
    const extracted = extract(candidate)
    if (extracted) return extracted
  }

  return null
}
|
||||
|
||||
/**
 * Parse book metadata and chapter titles out of a DAISY ncc.html document.
 *
 * `<meta name content>` pairs are collected under their lowercased name, the `<title>`
 * text is the fallback when no `dc:title` is present, and the text of every
 * `<h1>`..`<h6>` element becomes a chapter title in document order.
 *
 * @param {string} htmlText - contents of ncc.html
 * @returns {Object|null} metadata object with null-valued fields removed; null when htmlText is empty
 */
function parseDaisyMetadata(htmlText) {
  if (!htmlText) return null

  // Null-prototype map: meta names come from the file, and on a plain object a name such as
  // "constructor" or "__proto__" would resolve to an inherited member and crash the push below
  const metaTags = Object.create(null)
  const chapterTitles = []
  let titleText = ''
  let inTitle = false
  let currentHeadingName = null
  let currentHeadingText = ''

  const collapseWhitespace = (text) => text.replace(/\s+/g, ' ').trim()

  const parser = new h.Parser(
    {
      onopentag: (name, attribs) => {
        if (name === 'title') {
          inTitle = true
        }
        if (/^h[1-6]$/.test(name)) {
          currentHeadingName = name
          currentHeadingText = ''
        }
        if (name !== 'meta') return

        const tagName = attribs.name?.trim().toLowerCase()
        const content = attribs.content?.trim()
        if (!tagName || !content) return

        if (!metaTags[tagName]) metaTags[tagName] = []
        metaTags[tagName].push(content)
      },
      ontext: (text) => {
        if (inTitle) titleText += text
        // Text of elements nested in a heading (e.g. the link) is part of the heading title
        if (currentHeadingName) currentHeadingText += text
      },
      onclosetag: (name) => {
        if (name === 'title') {
          inTitle = false
        }
        if (name === currentHeadingName) {
          const chapterTitle = collapseWhitespace(currentHeadingText)
          if (chapterTitle) {
            chapterTitles.push(chapterTitle)
          }
          currentHeadingName = null
          currentHeadingText = ''
        }
      }
    },
    { decodeEntities: true }
  )

  parser.write(htmlText)
  parser.end()

  const creators = parseNameValues(getValues(metaTags, 'dc:creator'))
  const narrators = parseNameValues(getValues(metaTags, 'ncc:narrator'))
  const subjects = parseStringList([...getValues(metaTags, 'dc:subject'), ...getValues(metaTags, 'ncc:subject')])
  const tags = parseStringList([...getValues(metaTags, 'ncc:keywords'), ...getValues(metaTags, 'dc:tag')])

  const metadata = {
    // The <title> fallback gets the same whitespace normalisation as heading text
    title: getFirstValue(metaTags, ['dc:title']) || collapseWhitespace(titleText) || null,
    authors: creators,
    narrators,
    publishedYear: extractYear(getFirstValue(metaTags, ['dc:date', 'ncc:revisiondate'])),
    publisher: getFirstValue(metaTags, ['dc:publisher']),
    description: getFirstValue(metaTags, ['dc:description']),
    language: getFirstValue(metaTags, ['dc:language']),
    genres: subjects,
    tags,
    isbn: parseIdentifier(metaTags, 'isbn'),
    asin: parseIdentifier(metaTags, 'asin'),
    chapters: chapterTitles.map((title) => ({ title }))
  }

  // Drop fields the document did not provide so callers only see keys with a value
  for (const key of Object.keys(metadata)) {
    if (metadata[key] === null) {
      delete metadata[key]
    }
  }

  return metadata
}
|
||||
|
||||
module.exports = { parseDaisyMetadata }
|
||||
|
|
@ -24,7 +24,10 @@ function isMediaFile(mediaType, ext, audiobooksOnly = false) {
|
|||
return globals.SupportedAudioTypes.includes(extclean) || globals.SupportedEbookTypes.includes(extclean)
|
||||
}
|
||||
|
||||
function isScannableNonMediaFile(ext) {
|
||||
function isScannableNonMediaFile(ext, filename = '') {
|
||||
const filenameLower = filename.toLowerCase()
|
||||
if (filenameLower === 'ncc.html') return true
|
||||
|
||||
if (!ext) return false
|
||||
const extclean = ext.slice(1).toLowerCase()
|
||||
return globals.TextFileTypes.includes(extclean) || globals.MetadataFileTypes.includes(extclean) || globals.SupportedImageTypes.includes(extclean)
|
||||
|
|
@ -58,7 +61,7 @@ function groupFileItemsIntoLibraryItemDirs(mediaType, fileItems, audiobooksOnly,
|
|||
/** @type {import('./fileUtils').FilePathItem[]} */
|
||||
const otherFileItems = []
|
||||
itemsFiltered.forEach((item) => {
|
||||
if (isMediaFile(mediaType, item.extension, audiobooksOnly) || (includeNonMediaFiles && isScannableNonMediaFile(item.extension))) {
|
||||
if (isMediaFile(mediaType, item.extension, audiobooksOnly) || (includeNonMediaFiles && isScannableNonMediaFile(item.extension, item.name))) {
|
||||
mediaFileItems.push(item)
|
||||
} else {
|
||||
otherFileItems.push(item)
|
||||
|
|
|
|||
73
test/server/utils/parsers/parseDaisyMetadata.test.js
Normal file
73
test/server/utils/parsers/parseDaisyMetadata.test.js
Normal file
|
|
@ -0,0 +1,73 @@
|
|||
const chai = require('chai')
|
||||
const expect = chai.expect
|
||||
const { parseDaisyMetadata } = require('../../../../server/utils/parsers/parseDaisyMetadata')
|
||||
|
||||
describe('parseDaisyMetadata', () => {
|
||||
it('returns null if htmlText is empty', () => {
|
||||
const result = parseDaisyMetadata('')
|
||||
expect(result).to.be.null
|
||||
})
|
||||
|
||||
it('parses common metadata values from DAISY ncc.html', () => {
|
||||
const nccHtml = `
|
||||
<html>
|
||||
<head>
|
||||
<title>Fallback Title</title>
|
||||
<meta name="dc:title" content="The DAISY Book">
|
||||
<meta name="dc:creator" content="Jane Doe & Richard Roe">
|
||||
<meta name="ncc:narrator" content="Reader One; Reader Two">
|
||||
<meta name="dc:publisher" content="Talking Books Inc">
|
||||
<meta name="dc:date" content="2021-06-04">
|
||||
<meta name="dc:language" content="en">
|
||||
<meta name="dc:subject" content="Fiction, Mystery">
|
||||
<meta name="ncc:keywords" content="audio; daisy">
|
||||
<meta name="dc:identifier" content="ISBN 978-1-4028-9462-6">
|
||||
<meta name="dc:identifier:asin" content="ASIN: B012345678">
|
||||
</head>
|
||||
</html>
|
||||
`
|
||||
|
||||
const result = parseDaisyMetadata(nccHtml)
|
||||
expect(result.title).to.equal('The DAISY Book')
|
||||
expect(result.authors).to.deep.equal(['Jane Doe', 'Richard Roe'])
|
||||
expect(result.narrators).to.deep.equal(['Reader One', 'Reader Two'])
|
||||
expect(result.publisher).to.equal('Talking Books Inc')
|
||||
expect(result.publishedYear).to.equal('2021')
|
||||
expect(result.language).to.equal('en')
|
||||
expect(result.genres).to.deep.equal(['Fiction', 'Mystery'])
|
||||
expect(result.tags).to.deep.equal(['audio', 'daisy'])
|
||||
expect(result.isbn).to.equal('978-1-4028-9462-6')
|
||||
expect(result.asin).to.equal('B012345678')
|
||||
})
|
||||
|
||||
it('falls back to title tag when dc:title is not set', () => {
|
||||
const nccHtml = `
|
||||
<html>
|
||||
<head>
|
||||
<title>Title From Head</title>
|
||||
</head>
|
||||
</html>
|
||||
`
|
||||
const result = parseDaisyMetadata(nccHtml)
|
||||
expect(result.title).to.equal('Title From Head')
|
||||
})
|
||||
|
||||
it('parses chapter names from heading entries in ncc.html', () => {
|
||||
const nccHtml = `
|
||||
<html>
|
||||
<body>
|
||||
<h1><a href="book.smil#id1">Chapter 1</a></h1>
|
||||
<h2><a href="book.smil#id2">Chapter 2: The Road</a></h2>
|
||||
<h3>Part 1</h3>
|
||||
</body>
|
||||
</html>
|
||||
`
|
||||
|
||||
const result = parseDaisyMetadata(nccHtml)
|
||||
expect(result.chapters).to.deep.equal([
|
||||
{ title: 'Chapter 1' },
|
||||
{ title: 'Chapter 2: The Road' },
|
||||
{ title: 'Part 1' }
|
||||
])
|
||||
})
|
||||
})
|
||||
|
|
@ -49,4 +49,22 @@ describe('scanUtils', async () => {
|
|||
'Author/Series2/Book5/deeply/nested': ['cd 01/audiofile.mp3', 'cd 02/audiofile.mp3']
|
||||
})
|
||||
})
|
||||
|
||||
it('should include DAISY ncc.html changes when includeNonMediaFiles is enabled', async () => {
|
||||
const filePath = 'Author/Book3/ncc.html'
|
||||
const dirname = Path.dirname(filePath)
|
||||
const fileItems = [
|
||||
{
|
||||
name: Path.basename(filePath),
|
||||
reldirpath: dirname === '.' ? '' : dirname,
|
||||
extension: Path.extname(filePath),
|
||||
deep: filePath.split('/').length - 1
|
||||
}
|
||||
]
|
||||
|
||||
const libraryItemGrouping = scanUtils.groupFileItemsIntoLibraryItemDirs('book', fileItems, false, true)
|
||||
expect(libraryItemGrouping).to.deep.equal({
|
||||
'Author/Book3': ['ncc.html']
|
||||
})
|
||||
})
|
||||
})
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue