first iteration of parsing metadata and chapter names from ncc.html file

2026-05-16 00:11:30 +00:00 · 2026-02-07 16:45:40 +01:00 · 2026-02-07 16:45:40 +01:00 · 6c9bf8c2bd
commit 6c9bf8c2bd
parent fe13456a2b
10 changed files with 394 additions and 6 deletions
--- a/server/utils/parsers/parseDaisyMetadata.js
+++ b/server/utils/parsers/parseDaisyMetadata.js
@ -0,0 +1,176 @@
+const h = require('htmlparser2')
+const parseNameString = require('./parseNameString')
+
+function getValues(metaTags, tagName) {
+  return metaTags[tagName]?.filter((v) => v) || []
+}
+
+function getFirstValue(metaTags, tagNames) {
+  for (const tagName of tagNames) {
+    const values = getValues(metaTags, tagName)
+    if (values.length) return values[0]
+  }
+  return null
+}
+
+function parseNameValues(values) {
+  const names = []
+  values.forEach((value) => {
+    const parsedNames = parseNameString.parse(value)?.names || value.split(/\s*;\s*/).filter((n) => n)
+    parsedNames.forEach((name) => {
+      if (!names.includes(name)) names.push(name)
+    })
+  })
+  return names
+}
+
+function parseStringList(values) {
+  const items = []
+  values.forEach((value) => {
+    value.split(/\s*[;,]\s*/).forEach((item) => {
+      if (item && !items.includes(item)) {
+        items.push(item)
+      }
+    })
+  })
+  return items
+}
+
+function extractYear(str) {
+  if (!str) return null
+  const match = str.match(/\d{4}/)
+  return match ? match[0] : null
+}
+
+function extractIdentifierValue(identifier, identifierType) {
+  if (!identifier) return null
+
+  const value = identifier.trim()
+  const expression = identifierType === 'isbn'
+    ? /(?:^|[^a-z0-9])(97[89][\d\- ]{9,16}[\dx]|[\d\- ]{9,14}[\dx])(?:$|[^a-z0-9])/i
+    : /(?:^|[^a-z0-9])([a-z0-9]{10})(?:$|[^a-z0-9])/i
+
+  const match = value.match(expression)
+  if (!match) return null
+  return (match[1] || match[0]).replace(/^[^a-z0-9]+|[^a-z0-9]+$/gi, '').trim()
+}
+
+function parseIdentifier(metaTags, identifierType) {
+  const typeTag = identifierType === 'isbn' ? 'dc:identifier:isbn' : 'dc:identifier:asin'
+  const typedIdentifier = getFirstValue(metaTags, [typeTag, identifierType])
+  if (typedIdentifier) {
+    const extracted = extractIdentifierValue(typedIdentifier, identifierType)
+    if (extracted) return extracted
+  }
+
+  const identifierValues = [
+    ...getValues(metaTags, 'dc:identifier'),
+    ...getValues(metaTags, 'ncc:identifier')
+  ]
+  for (const identifier of identifierValues) {
+    if (identifierType === 'isbn' && /isbn/i.test(identifier)) {
+      const extracted = extractIdentifierValue(identifier, identifierType)
+      if (extracted) return extracted
+    }
+    if (identifierType === 'asin' && /asin/i.test(identifier)) {
+      const extracted = extractIdentifierValue(identifier, identifierType)
+      if (extracted) return extracted
+    }
+  }
+
+  for (const identifier of identifierValues) {
+    const extracted = extractIdentifierValue(identifier, identifierType)
+    if (extracted) return extracted
+  }
+  return null
+}
+
+function parseDaisyMetadata(htmlText) {
+  if (!htmlText) return null
+
+  const metaTags = {}
+  let titleText = ''
+  let inTitle = false
+  let currentHeadingName = null
+  let currentHeadingText = ''
+  const chapterTitles = []
+
+  const parser = new h.Parser(
+    {
+      onopentag: (name, attribs) => {
+        if (name === 'title') {
+          inTitle = true
+        }
+        if (/^h[1-6]$/.test(name)) {
+          currentHeadingName = name
+          currentHeadingText = ''
+        }
+        if (name !== 'meta') return
+
+        const tagName = attribs.name?.trim().toLowerCase()
+        const content = attribs.content?.trim()
+        if (!tagName || !content) return
+
+        if (!metaTags[tagName]) metaTags[tagName] = []
+        metaTags[tagName].push(content)
+      },
+      ontext: (text) => {
+        if (inTitle) titleText += text
+        if (currentHeadingName) currentHeadingText += text
+      },
+      onclosetag: (name) => {
+        if (name === 'title') {
+          inTitle = false
+        }
+        if (name === currentHeadingName) {
+          const chapterTitle = currentHeadingText.replace(/\s+/g, ' ').trim()
+          if (chapterTitle) {
+            chapterTitles.push(chapterTitle)
+          }
+          currentHeadingName = null
+          currentHeadingText = ''
+        }
+      }
+    },
+    { decodeEntities: true }
+  )
+
+  parser.write(htmlText)
+  parser.end()
+
+  const creators = parseNameValues(getValues(metaTags, 'dc:creator'))
+  const narrators = parseNameValues(getValues(metaTags, 'ncc:narrator'))
+  const subjects = parseStringList([
+    ...getValues(metaTags, 'dc:subject'),
+    ...getValues(metaTags, 'ncc:subject')
+  ])
+  const tags = parseStringList([
+    ...getValues(metaTags, 'ncc:keywords'),
+    ...getValues(metaTags, 'dc:tag')
+  ])
+
+  const metadata = {
+    title: getFirstValue(metaTags, ['dc:title']) || titleText.trim() || null,
+    authors: creators,
+    narrators,
+    publishedYear: extractYear(getFirstValue(metaTags, ['dc:date', 'ncc:revisiondate'])),
+    publisher: getFirstValue(metaTags, ['dc:publisher']),
+    description: getFirstValue(metaTags, ['dc:description']),
+    language: getFirstValue(metaTags, ['dc:language']),
+    genres: subjects,
+    tags,
+    isbn: parseIdentifier(metaTags, 'isbn'),
+    asin: parseIdentifier(metaTags, 'asin'),
+    chapters: chapterTitles.map((title) => ({ title }))
+  }
+
+  for (const key in metadata) {
+    if (metadata[key] === null) {
+      delete metadata[key]
+    }
+  }
+
+  return metadata
+}
+
+module.exports = { parseDaisyMetadata }
--- a/server/utils/scandir.js
+++ b/server/utils/scandir.js
@ -24,7 +24,10 @@ function isMediaFile(mediaType, ext, audiobooksOnly = false) {
  return globals.SupportedAudioTypes.includes(extclean) || globals.SupportedEbookTypes.includes(extclean)
 }

-function isScannableNonMediaFile(ext) {
+function isScannableNonMediaFile(ext, filename = '') {
+  const filenameLower = filename.toLowerCase()
+  if (filenameLower === 'ncc.html') return true
+
  if (!ext) return false
  const extclean = ext.slice(1).toLowerCase()
  return globals.TextFileTypes.includes(extclean) || globals.MetadataFileTypes.includes(extclean) || globals.SupportedImageTypes.includes(extclean)
@ -58,7 +61,7 @@ function groupFileItemsIntoLibraryItemDirs(mediaType, fileItems, audiobooksOnly,
  /** @type {import('./fileUtils').FilePathItem[]} */
  const otherFileItems = []
  itemsFiltered.forEach((item) => {
-    if (isMediaFile(mediaType, item.extension, audiobooksOnly) || (includeNonMediaFiles && isScannableNonMediaFile(item.extension))) {
+    if (isMediaFile(mediaType, item.extension, audiobooksOnly) || (includeNonMediaFiles && isScannableNonMediaFile(item.extension, item.name))) {
      mediaFileItems.push(item)
    } else {
      otherFileItems.push(item)