mirror of
https://github.com/advplyr/audiobookshelf.git
synced 2026-03-01 05:29:41 +00:00
first iteration of parsing metadata and chapter names from ncc.html file
This commit is contained in:
parent
fe13456a2b
commit
6c9bf8c2bd
10 changed files with 394 additions and 6 deletions
176
server/utils/parsers/parseDaisyMetadata.js
Normal file
176
server/utils/parsers/parseDaisyMetadata.js
Normal file
|
|
@ -0,0 +1,176 @@
|
|||
const h = require('htmlparser2')
|
||||
const parseNameString = require('./parseNameString')
|
||||
|
||||
function getValues(metaTags, tagName) {
|
||||
return metaTags[tagName]?.filter((v) => v) || []
|
||||
}
|
||||
|
||||
function getFirstValue(metaTags, tagNames) {
|
||||
for (const tagName of tagNames) {
|
||||
const values = getValues(metaTags, tagName)
|
||||
if (values.length) return values[0]
|
||||
}
|
||||
return null
|
||||
}
|
||||
|
||||
function parseNameValues(values) {
|
||||
const names = []
|
||||
values.forEach((value) => {
|
||||
const parsedNames = parseNameString.parse(value)?.names || value.split(/\s*;\s*/).filter((n) => n)
|
||||
parsedNames.forEach((name) => {
|
||||
if (!names.includes(name)) names.push(name)
|
||||
})
|
||||
})
|
||||
return names
|
||||
}
|
||||
|
||||
function parseStringList(values) {
|
||||
const items = []
|
||||
values.forEach((value) => {
|
||||
value.split(/\s*[;,]\s*/).forEach((item) => {
|
||||
if (item && !items.includes(item)) {
|
||||
items.push(item)
|
||||
}
|
||||
})
|
||||
})
|
||||
return items
|
||||
}
|
||||
|
||||
function extractYear(str) {
|
||||
if (!str) return null
|
||||
const match = str.match(/\d{4}/)
|
||||
return match ? match[0] : null
|
||||
}
|
||||
|
||||
function extractIdentifierValue(identifier, identifierType) {
|
||||
if (!identifier) return null
|
||||
|
||||
const value = identifier.trim()
|
||||
const expression = identifierType === 'isbn'
|
||||
? /(?:^|[^a-z0-9])(97[89][\d\- ]{9,16}[\dx]|[\d\- ]{9,14}[\dx])(?:$|[^a-z0-9])/i
|
||||
: /(?:^|[^a-z0-9])([a-z0-9]{10})(?:$|[^a-z0-9])/i
|
||||
|
||||
const match = value.match(expression)
|
||||
if (!match) return null
|
||||
return (match[1] || match[0]).replace(/^[^a-z0-9]+|[^a-z0-9]+$/gi, '').trim()
|
||||
}
|
||||
|
||||
function parseIdentifier(metaTags, identifierType) {
|
||||
const typeTag = identifierType === 'isbn' ? 'dc:identifier:isbn' : 'dc:identifier:asin'
|
||||
const typedIdentifier = getFirstValue(metaTags, [typeTag, identifierType])
|
||||
if (typedIdentifier) {
|
||||
const extracted = extractIdentifierValue(typedIdentifier, identifierType)
|
||||
if (extracted) return extracted
|
||||
}
|
||||
|
||||
const identifierValues = [
|
||||
...getValues(metaTags, 'dc:identifier'),
|
||||
...getValues(metaTags, 'ncc:identifier')
|
||||
]
|
||||
for (const identifier of identifierValues) {
|
||||
if (identifierType === 'isbn' && /isbn/i.test(identifier)) {
|
||||
const extracted = extractIdentifierValue(identifier, identifierType)
|
||||
if (extracted) return extracted
|
||||
}
|
||||
if (identifierType === 'asin' && /asin/i.test(identifier)) {
|
||||
const extracted = extractIdentifierValue(identifier, identifierType)
|
||||
if (extracted) return extracted
|
||||
}
|
||||
}
|
||||
|
||||
for (const identifier of identifierValues) {
|
||||
const extracted = extractIdentifierValue(identifier, identifierType)
|
||||
if (extracted) return extracted
|
||||
}
|
||||
return null
|
||||
}
|
||||
|
||||
function parseDaisyMetadata(htmlText) {
|
||||
if (!htmlText) return null
|
||||
|
||||
const metaTags = {}
|
||||
let titleText = ''
|
||||
let inTitle = false
|
||||
let currentHeadingName = null
|
||||
let currentHeadingText = ''
|
||||
const chapterTitles = []
|
||||
|
||||
const parser = new h.Parser(
|
||||
{
|
||||
onopentag: (name, attribs) => {
|
||||
if (name === 'title') {
|
||||
inTitle = true
|
||||
}
|
||||
if (/^h[1-6]$/.test(name)) {
|
||||
currentHeadingName = name
|
||||
currentHeadingText = ''
|
||||
}
|
||||
if (name !== 'meta') return
|
||||
|
||||
const tagName = attribs.name?.trim().toLowerCase()
|
||||
const content = attribs.content?.trim()
|
||||
if (!tagName || !content) return
|
||||
|
||||
if (!metaTags[tagName]) metaTags[tagName] = []
|
||||
metaTags[tagName].push(content)
|
||||
},
|
||||
ontext: (text) => {
|
||||
if (inTitle) titleText += text
|
||||
if (currentHeadingName) currentHeadingText += text
|
||||
},
|
||||
onclosetag: (name) => {
|
||||
if (name === 'title') {
|
||||
inTitle = false
|
||||
}
|
||||
if (name === currentHeadingName) {
|
||||
const chapterTitle = currentHeadingText.replace(/\s+/g, ' ').trim()
|
||||
if (chapterTitle) {
|
||||
chapterTitles.push(chapterTitle)
|
||||
}
|
||||
currentHeadingName = null
|
||||
currentHeadingText = ''
|
||||
}
|
||||
}
|
||||
},
|
||||
{ decodeEntities: true }
|
||||
)
|
||||
|
||||
parser.write(htmlText)
|
||||
parser.end()
|
||||
|
||||
const creators = parseNameValues(getValues(metaTags, 'dc:creator'))
|
||||
const narrators = parseNameValues(getValues(metaTags, 'ncc:narrator'))
|
||||
const subjects = parseStringList([
|
||||
...getValues(metaTags, 'dc:subject'),
|
||||
...getValues(metaTags, 'ncc:subject')
|
||||
])
|
||||
const tags = parseStringList([
|
||||
...getValues(metaTags, 'ncc:keywords'),
|
||||
...getValues(metaTags, 'dc:tag')
|
||||
])
|
||||
|
||||
const metadata = {
|
||||
title: getFirstValue(metaTags, ['dc:title']) || titleText.trim() || null,
|
||||
authors: creators,
|
||||
narrators,
|
||||
publishedYear: extractYear(getFirstValue(metaTags, ['dc:date', 'ncc:revisiondate'])),
|
||||
publisher: getFirstValue(metaTags, ['dc:publisher']),
|
||||
description: getFirstValue(metaTags, ['dc:description']),
|
||||
language: getFirstValue(metaTags, ['dc:language']),
|
||||
genres: subjects,
|
||||
tags,
|
||||
isbn: parseIdentifier(metaTags, 'isbn'),
|
||||
asin: parseIdentifier(metaTags, 'asin'),
|
||||
chapters: chapterTitles.map((title) => ({ title }))
|
||||
}
|
||||
|
||||
for (const key in metadata) {
|
||||
if (metadata[key] === null) {
|
||||
delete metadata[key]
|
||||
}
|
||||
}
|
||||
|
||||
return metadata
|
||||
}
|
||||
|
||||
module.exports = { parseDaisyMetadata }
|
||||
|
|
@ -24,7 +24,10 @@ function isMediaFile(mediaType, ext, audiobooksOnly = false) {
|
|||
return globals.SupportedAudioTypes.includes(extclean) || globals.SupportedEbookTypes.includes(extclean)
|
||||
}
|
||||
|
||||
function isScannableNonMediaFile(ext) {
|
||||
function isScannableNonMediaFile(ext, filename = '') {
|
||||
const filenameLower = filename.toLowerCase()
|
||||
if (filenameLower === 'ncc.html') return true
|
||||
|
||||
if (!ext) return false
|
||||
const extclean = ext.slice(1).toLowerCase()
|
||||
return globals.TextFileTypes.includes(extclean) || globals.MetadataFileTypes.includes(extclean) || globals.SupportedImageTypes.includes(extclean)
|
||||
|
|
@ -58,7 +61,7 @@ function groupFileItemsIntoLibraryItemDirs(mediaType, fileItems, audiobooksOnly,
|
|||
/** @type {import('./fileUtils').FilePathItem[]} */
|
||||
const otherFileItems = []
|
||||
itemsFiltered.forEach((item) => {
|
||||
if (isMediaFile(mediaType, item.extension, audiobooksOnly) || (includeNonMediaFiles && isScannableNonMediaFile(item.extension))) {
|
||||
if (isMediaFile(mediaType, item.extension, audiobooksOnly) || (includeNonMediaFiles && isScannableNonMediaFile(item.extension, item.name))) {
|
||||
mediaFileItems.push(item)
|
||||
} else {
|
||||
otherFileItems.push(item)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue