first iteration of parsing metadata and chapter names from ncc.html file

2026-07-07 09:51:37 +00:00 · 2026-02-07 16:45:40 +01:00 · 2026-02-07 16:45:40 +01:00 · f157e63fd7
commit f157e63fd7
parent 9defe67fe9
10 changed files with 394 additions and 6 deletions
--- a/client/components/modals/libraries/EditModal.vue
+++ b/client/components/modals/libraries/EditModal.vue
@ -127,7 +127,7 @@ export default {
          autoScanCronExpression: null,
          hideSingleBookSeries: false,
          onlyShowLaterBooksInContinueSeries: false,
-          metadataPrecedence: ['folderStructure', 'audioMetatags', 'nfoFile', 'txtFiles', 'opfFile', 'absMetadata'],
+          metadataPrecedence: ['folderStructure', 'audioMetatags', 'nfoFile', 'txtFiles', 'opfFile', 'daisyFile', 'absMetadata'],
          markAsFinishedPercentComplete: null,
          markAsFinishedTimeRemaining: 10
        }
--- a/client/components/modals/libraries/LibraryScannerSettings.vue
+++ b/client/components/modals/libraries/LibraryScannerSettings.vue
@ -81,6 +81,11 @@ export default {
          name: 'OPF file',
          include: true
        },
+        daisyFile: {
+          id: 'daisyFile',
+          name: 'DAISY ncc.html file',
+          include: true
+        },
        absMetadata: {
          id: 'absMetadata',
          name: 'Audiobookshelf metadata file',
@ -157,4 +162,4 @@ export default {
    this.init()
  }
 }
-</script>
+</script>
--- a/server/models/Library.js
+++ b/server/models/Library.js
@ -82,7 +82,7 @@ class Library extends Model {
  }

  static get defaultMetadataPrecedence() {
-    return ['folderStructure', 'audioMetatags', 'nfoFile', 'txtFiles', 'opfFile', 'absMetadata']
+    return ['folderStructure', 'audioMetatags', 'nfoFile', 'txtFiles', 'opfFile', 'daisyFile', 'absMetadata']
  }

  /**
--- a/server/scanner/BookScanner.js
+++ b/server/scanner/BookScanner.js
@ -23,6 +23,7 @@ const CoverManager = require('../managers/CoverManager')
 const LibraryScan = require('./LibraryScan')
 const OpfFileScanner = require('./OpfFileScanner')
 const NfoFileScanner = require('./NfoFileScanner')
+const DaisyFileScanner = require('./DaisyFileScanner')
 const AbsMetadataFileScanner = require('./AbsMetadataFileScanner')

 /**
@ -792,6 +793,14 @@ class BookScanner {
      await OpfFileScanner.scanBookOpfFile(this.libraryItemData.metadataOpfLibraryFile, this.bookMetadata)
    }

+    /**
+     * Metadata from DAISY ncc.html file.
+     * Does nothing when the scan data has no ncc.html library file.
+     */
+    async daisyFile() {
+      const nccLibraryFile = this.libraryItemData.metadataDaisyNccLibraryFile
+      if (nccLibraryFile) {
+        await DaisyFileScanner.scanBookDaisyFile(nccLibraryFile, this.bookMetadata, this.audioFiles)
+      }
+    }
+
    /**
     * Metadata from metadata.json
     */
--- a/server/scanner/DaisyFileScanner.js
+++ b/server/scanner/DaisyFileScanner.js
@ -0,0 +1,99 @@
+const { parseDaisyMetadata } = require('../utils/parsers/parseDaisyMetadata')
+const { readTextFile } = require('../utils/fileUtils')
+const Path = require('path')
+
+class DaisyFileScanner {
+  constructor() {}
+
+  /**
+   * Parse metadata from DAISY ncc.html file found in library scan and update bookMetadata.
+   * Only values that DAISY actually provides (non-empty lists, truthy scalars) overwrite
+   * what is already on bookMetadata.
+   *
+   * @param {import('../models/LibraryItem').LibraryFileObject} daisyLibraryFileObj
+   * @param {Object} bookMetadata - updated in place
+   * @param {import('../models/Book').AudioFileObject[]} [audioFiles] - used to build chapter timings when bookMetadata has no chapters yet
+   * @returns {Promise<void>}
+   */
+  async scanBookDaisyFile(daisyLibraryFileObj, bookMetadata, audioFiles = []) {
+    const htmlText = await readTextFile(daisyLibraryFileObj.metadata.path)
+    const daisyMetadata = htmlText ? parseDaisyMetadata(htmlText) : null
+    if (!daisyMetadata) return
+
+    for (const [key, value] of Object.entries(daisyMetadata)) {
+      switch (key) {
+        case 'tags':
+        case 'genres':
+        case 'authors':
+        case 'narrators':
+          // List fields replace the existing value only when DAISY supplied at least one entry
+          if (value?.length) bookMetadata[key] = value
+          break
+        case 'chapters': {
+          if (!value?.length) break
+
+          if (bookMetadata.chapters?.length) {
+            // DAISY ncc.html provides chapter names; preserve existing timings if available.
+            bookMetadata.chapters = bookMetadata.chapters.map((chapter, index) => {
+              const daisyTitle = value[index]?.title
+              if (!daisyTitle) return chapter
+              return {
+                ...chapter,
+                id: chapter.id ?? index,
+                title: daisyTitle
+              }
+            })
+          } else {
+            // No chapters yet: derive timings from the audio files and name them from DAISY
+            const chaptersFromFiles = this.buildChaptersFromAudioFiles(audioFiles, value)
+            if (chaptersFromFiles.length) {
+              bookMetadata.chapters = chaptersFromFiles
+            }
+          }
+          break
+        }
+        default:
+          if (value) bookMetadata[key] = value
+      }
+    }
+  }
+
+  /**
+   * Build chapter timings from ordered audio files while applying DAISY chapter titles.
+   * Falls back to file basenames (or "Chapter N" when a file has no filename) if DAISY
+   * has fewer titles than files, including when no DAISY titles are passed at all.
+   * Audio files without a duration are skipped and do not consume a chapter id or title.
+   *
+   * @param {import('../models/Book').AudioFileObject[]} audioFiles
+   * @param {{title:string}[]} [daisyChapters]
+   * @returns {import('../models/Book').ChapterObject[]}
+   */
+  buildChaptersFromAudioFiles(audioFiles, daisyChapters = []) {
+    if (!audioFiles?.length) return []
+
+    const chapters = []
+    let currentStartTime = 0
+
+    for (const audioFile of audioFiles) {
+      if (!audioFile.duration) continue
+
+      // Chapter ids are sequential over the files that were kept
+      const chapterId = chapters.length
+      const filename = audioFile.metadata?.filename
+      const fallbackTitle = filename ? Path.basename(filename, Path.extname(filename)) : `Chapter ${chapterId + 1}`
+      const end = currentStartTime + audioFile.duration
+
+      chapters.push({
+        id: chapterId,
+        start: currentStartTime,
+        end,
+        // Optional chaining keeps a null/undefined title list on the fallback path
+        title: daisyChapters?.[chapterId]?.title || fallbackTitle
+      })
+
+      currentStartTime = end
+    }
+
+    return chapters
+  }
+}
+module.exports = new DaisyFileScanner()
--- a/server/scanner/LibraryItemScanData.js
+++ b/server/scanner/LibraryItemScanData.js
@ -173,6 +173,11 @@ class LibraryItemScanData {
    return this.libraryFiles.find(lf => lf.metadata.ext.toLowerCase() === '.nfo')
  }

+  /**
+   * First library file whose filename is ncc.html (compared case-insensitively),
+   * or undefined when there is none.
+   * @type {LibraryItem.LibraryFileObject}
+   */
+  get metadataDaisyNccLibraryFile() {
+    const isDaisyNcc = (libraryFile) => {
+      const filename = libraryFile.metadata.filename
+      return filename?.toLowerCase() === 'ncc.html'
+    }
+    return this.libraryFiles.find(isDaisyNcc)
+  }
+
  /**
   * 
   * @param {LibraryItem} existingLibraryItem 
@ -374,4 +379,4 @@ class LibraryItemScanData {
    }
  }
 }
-module.exports = LibraryItemScanData
+module.exports = LibraryItemScanData
--- a/server/utils/parsers/parseDaisyMetadata.js
+++ b/server/utils/parsers/parseDaisyMetadata.js
@ -0,0 +1,176 @@
+const h = require('htmlparser2')
+const parseNameString = require('./parseNameString')
+
+/**
+ * Collect the non-empty values recorded for a meta tag name.
+ *
+ * @param {Object<string, string[]>} metaTags - lookup of meta name to its content values
+ * @param {string} tagName
+ * @returns {string[]} truthy values in their original order, or an empty array when the tag is absent
+ */
+function getValues(metaTags, tagName) {
+  const recorded = metaTags[tagName]
+  if (recorded === undefined || recorded === null) return []
+  return recorded.filter(Boolean)
+}
+
+/**
+ * Return the first non-empty value found under any of the given meta tag names,
+ * trying the names in the order supplied.
+ *
+ * @param {Object<string, string[]>} metaTags
+ * @param {string[]} tagNames - candidate meta names, highest priority first
+ * @returns {string|null} null when none of the names has a value
+ */
+function getFirstValue(metaTags, tagNames) {
+  for (const candidateTag of tagNames) {
+    // getValues only returns truthy entries, so a defined first element means "has a value"
+    const [firstValue] = getValues(metaTags, candidateTag)
+    if (firstValue !== undefined) return firstValue
+  }
+  return null
+}
+
+/**
+ * Split raw creator/narrator strings into a de-duplicated list of names.
+ * Uses parseNameString when it yields names, otherwise splits the value on semicolons.
+ *
+ * @param {string[]} values
+ * @returns {string[]} unique names in first-seen order
+ */
+function parseNameValues(values) {
+  const uniqueNames = new Set()
+  for (const rawValue of values) {
+    const parsed = parseNameString.parse(rawValue)
+    const nameList = parsed?.names || rawValue.split(/\s*;\s*/).filter(Boolean)
+    for (const name of nameList) {
+      uniqueNames.add(name)
+    }
+  }
+  return [...uniqueNames]
+}
+
+/**
+ * Split semicolon- or comma-separated strings into a de-duplicated list of items.
+ *
+ * @param {string[]} values
+ * @returns {string[]} unique non-empty items in first-seen order
+ */
+function parseStringList(values) {
+  const seen = new Set()
+  for (const rawValue of values) {
+    for (const entry of rawValue.split(/\s*[;,]\s*/)) {
+      // Splitting leaves empty strings for trailing or doubled separators
+      if (entry) seen.add(entry)
+    }
+  }
+  return Array.from(seen)
+}
+
+/**
+ * Pull the first run of four consecutive digits out of a string.
+ *
+ * @param {string} str
+ * @returns {string|null} the four digits, or null when str is empty or has no such run
+ */
+function extractYear(str) {
+  if (!str) return null
+  const [year = null] = str.match(/\d{4}/) ?? []
+  return year
+}
+
+/**
+ * Extract an ISBN-like or ASIN-like token from an identifier string.
+ * For 'isbn' the token is a digit/hyphen/space run ending in a digit or x; for any other
+ * type it is exactly ten alphanumeric characters. The token must be delimited by the
+ * string edges or by non-alphanumeric characters.
+ *
+ * @param {string} identifier
+ * @param {string} identifierType - 'isbn' selects the ISBN pattern, anything else the ASIN pattern
+ * @returns {string|null} the token without surrounding non-alphanumerics, or null if nothing matches
+ */
+function extractIdentifierValue(identifier, identifierType) {
+  if (!identifier) return null
+
+  let pattern
+  if (identifierType === 'isbn') {
+    pattern = /(?:^|[^a-z0-9])(97[89][\d\- ]{9,16}[\dx]|[\d\- ]{9,14}[\dx])(?:$|[^a-z0-9])/i
+  } else {
+    pattern = /(?:^|[^a-z0-9])([a-z0-9]{10})(?:$|[^a-z0-9])/i
+  }
+
+  const found = identifier.trim().match(pattern)
+  if (found === null) return null
+
+  // The captured group can still start with a separator (e.g. a leading space), so strip the edges
+  const candidate = found[1] || found[0]
+  return candidate.replace(/^[^a-z0-9]+|[^a-z0-9]+$/gi, '').trim()
+}
+
+/**
+ * Find an ISBN or ASIN among the collected meta tags.
+ * Lookup order: the type-specific tag (dc:identifier:isbn / dc:identifier:asin, then a tag
+ * named after the type), then dc:identifier / ncc:identifier values that mention the type,
+ * then any dc:identifier / ncc:identifier value that matches the pattern.
+ *
+ * @param {Object<string, string[]>} metaTags
+ * @param {string} identifierType - 'isbn' or 'asin'
+ * @returns {string|null}
+ */
+function parseIdentifier(metaTags, identifierType) {
+  const wantsIsbn = identifierType === 'isbn'
+  const wantsAsin = identifierType === 'asin'
+
+  const typeTag = wantsIsbn ? 'dc:identifier:isbn' : 'dc:identifier:asin'
+  const typedIdentifier = getFirstValue(metaTags, [typeTag, identifierType])
+  const fromTypedTag = typedIdentifier ? extractIdentifierValue(typedIdentifier, identifierType) : null
+  if (fromTypedTag) return fromTypedTag
+
+  const identifierValues = getValues(metaTags, 'dc:identifier').concat(getValues(metaTags, 'ncc:identifier'))
+
+  // Returns the first successful extraction from the candidates, or null
+  const firstExtracted = (candidates) => {
+    for (const candidate of candidates) {
+      const extracted = extractIdentifierValue(candidate, identifierType)
+      if (extracted) return extracted
+    }
+    return null
+  }
+
+  // Prefer generic identifiers that explicitly mention the requested type
+  const labelled = identifierValues.filter((identifier) => {
+    if (wantsIsbn) return /isbn/i.test(identifier)
+    if (wantsAsin) return /asin/i.test(identifier)
+    return false
+  })
+
+  return firstExtracted(labelled) || firstExtracted(identifierValues)
+}
+
+/**
+ * Parse book metadata and chapter titles from the text of a DAISY ncc.html file.
+ *
+ * Values are read from <meta name="..." content="..."> tags (names are lower-cased),
+ * the title falls back to the <title> element, and each non-empty h1-h6 heading
+ * becomes one chapter title in document order.
+ *
+ * @param {string} htmlText
+ * @returns {Object|null} metadata with null-valued fields removed, or null when htmlText is empty
+ */
+function parseDaisyMetadata(htmlText) {
+  if (!htmlText) return null
+
+  // Prototype-less lookup: meta names come from the file, so a name such as
+  // "constructor" or "__proto__" must not resolve to an Object.prototype member
+  // (with a plain {} the push below would throw on such a name).
+  const metaTags = Object.create(null)
+  let titleText = ''
+  let inTitle = false
+  let currentHeadingName = null
+  let currentHeadingText = ''
+  const chapterTitles = []
+
+  const parser = new h.Parser(
+    {
+      onopentag: (name, attribs) => {
+        if (name === 'title') {
+          inTitle = true
+        }
+        if (/^h[1-6]$/.test(name)) {
+          // Start collecting text for a new heading
+          currentHeadingName = name
+          currentHeadingText = ''
+        }
+        if (name !== 'meta') return
+
+        const tagName = attribs.name?.trim().toLowerCase()
+        const content = attribs.content?.trim()
+        if (!tagName || !content) return
+
+        // A meta name may repeat, so every value is kept in order
+        if (!metaTags[tagName]) metaTags[tagName] = []
+        metaTags[tagName].push(content)
+      },
+      ontext: (text) => {
+        if (inTitle) titleText += text
+        if (currentHeadingName) currentHeadingText += text
+      },
+      onclosetag: (name) => {
+        if (name === 'title') {
+          inTitle = false
+        }
+        if (name === currentHeadingName) {
+          // Collapse whitespace so text split across nested tags/lines becomes one title
+          const chapterTitle = currentHeadingText.replace(/\s+/g, ' ').trim()
+          if (chapterTitle) {
+            chapterTitles.push(chapterTitle)
+          }
+          currentHeadingName = null
+          currentHeadingText = ''
+        }
+      }
+    },
+    { decodeEntities: true }
+  )
+
+  parser.write(htmlText)
+  parser.end()
+
+  const creators = parseNameValues(getValues(metaTags, 'dc:creator'))
+  const narrators = parseNameValues(getValues(metaTags, 'ncc:narrator'))
+  const subjects = parseStringList([
+    ...getValues(metaTags, 'dc:subject'),
+    ...getValues(metaTags, 'ncc:subject')
+  ])
+  const tags = parseStringList([
+    ...getValues(metaTags, 'ncc:keywords'),
+    ...getValues(metaTags, 'dc:tag')
+  ])
+
+  const metadata = {
+    title: getFirstValue(metaTags, ['dc:title']) || titleText.trim() || null,
+    authors: creators,
+    narrators,
+    publishedYear: extractYear(getFirstValue(metaTags, ['dc:date', 'ncc:revisiondate'])),
+    publisher: getFirstValue(metaTags, ['dc:publisher']),
+    description: getFirstValue(metaTags, ['dc:description']),
+    language: getFirstValue(metaTags, ['dc:language']),
+    genres: subjects,
+    tags,
+    isbn: parseIdentifier(metaTags, 'isbn'),
+    asin: parseIdentifier(metaTags, 'asin'),
+    chapters: chapterTitles.map((title) => ({ title }))
+  }
+
+  // Drop fields that were not found; list fields stay even when empty
+  for (const key of Object.keys(metadata)) {
+    if (metadata[key] === null) {
+      delete metadata[key]
+    }
+  }
+
+  return metadata
+}
+
+module.exports = { parseDaisyMetadata }
--- a/server/utils/scandir.js
+++ b/server/utils/scandir.js
@ -24,7 +24,10 @@ function isMediaFile(mediaType, ext, audiobooksOnly = false) {
  return globals.SupportedAudioTypes.includes(extclean) || globals.SupportedEbookTypes.includes(extclean)
 }

-function isScannableNonMediaFile(ext) {
+function isScannableNonMediaFile(ext, filename = '') {
+  const filenameLower = filename.toLowerCase()
+  if (filenameLower === 'ncc.html') return true
+
  if (!ext) return false
  const extclean = ext.slice(1).toLowerCase()
  return globals.TextFileTypes.includes(extclean) || globals.MetadataFileTypes.includes(extclean) || globals.SupportedImageTypes.includes(extclean)
@ -58,7 +61,7 @@ function groupFileItemsIntoLibraryItemDirs(mediaType, fileItems, audiobooksOnly,
  /** @type {import('./fileUtils').FilePathItem[]} */
  const otherFileItems = []
  itemsFiltered.forEach((item) => {
-    if (isMediaFile(mediaType, item.extension, audiobooksOnly) || (includeNonMediaFiles && isScannableNonMediaFile(item.extension))) {
+    if (isMediaFile(mediaType, item.extension, audiobooksOnly) || (includeNonMediaFiles && isScannableNonMediaFile(item.extension, item.name))) {
      mediaFileItems.push(item)
    } else {
      otherFileItems.push(item)
--- a/test/server/utils/parsers/parseDaisyMetadata.test.js
+++ b/test/server/utils/parsers/parseDaisyMetadata.test.js
@ -0,0 +1,73 @@
+const chai = require('chai')
+const expect = chai.expect
+const { parseDaisyMetadata } = require('../../../../server/utils/parsers/parseDaisyMetadata')
+
+// Unit tests for parseDaisyMetadata, driven by inline ncc.html fixtures.
+describe('parseDaisyMetadata', () => {
+  // Empty input yields null rather than an empty metadata object
+  it('returns null if htmlText is empty', () => {
+    const result = parseDaisyMetadata('')
+    expect(result).to.be.null
+  })
+
+  // Covers meta-tag fields: dc:title wins over <title>, names and lists are split,
+  // the year is taken from dc:date, and ISBN/ASIN are pulled from identifier tags
+  it('parses common metadata values from DAISY ncc.html', () => {
+    const nccHtml = `
+      <html>
+        <head>
+          <title>Fallback Title</title>
+          <meta name="dc:title" content="The DAISY Book">
+          <meta name="dc:creator" content="Jane Doe & Richard Roe">
+          <meta name="ncc:narrator" content="Reader One; Reader Two">
+          <meta name="dc:publisher" content="Talking Books Inc">
+          <meta name="dc:date" content="2021-06-04">
+          <meta name="dc:language" content="en">
+          <meta name="dc:subject" content="Fiction, Mystery">
+          <meta name="ncc:keywords" content="audio; daisy">
+          <meta name="dc:identifier" content="ISBN 978-1-4028-9462-6">
+          <meta name="dc:identifier:asin" content="ASIN: B012345678">
+        </head>
+      </html>
+    `
+
+    const result = parseDaisyMetadata(nccHtml)
+    expect(result.title).to.equal('The DAISY Book')
+    expect(result.authors).to.deep.equal(['Jane Doe', 'Richard Roe'])
+    expect(result.narrators).to.deep.equal(['Reader One', 'Reader Two'])
+    expect(result.publisher).to.equal('Talking Books Inc')
+    expect(result.publishedYear).to.equal('2021')
+    expect(result.language).to.equal('en')
+    expect(result.genres).to.deep.equal(['Fiction', 'Mystery'])
+    expect(result.tags).to.deep.equal(['audio', 'daisy'])
+    expect(result.isbn).to.equal('978-1-4028-9462-6')
+    expect(result.asin).to.equal('B012345678')
+  })
+
+  // Without a dc:title meta tag the <title> element text is used
+  it('falls back to title tag when dc:title is not set', () => {
+    const nccHtml = `
+      <html>
+        <head>
+          <title>Title From Head</title>
+        </head>
+      </html>
+    `
+    const result = parseDaisyMetadata(nccHtml)
+    expect(result.title).to.equal('Title From Head')
+  })
+
+  // Headings of any level become chapters in document order; text inside nested <a> tags counts
+  it('parses chapter names from heading entries in ncc.html', () => {
+    const nccHtml = `
+      <html>
+        <body>
+          <h1><a href="book.smil#id1">Chapter 1</a></h1>
+          <h2><a href="book.smil#id2">Chapter 2: The Road</a></h2>
+          <h3>Part 1</h3>
+        </body>
+      </html>
+    `
+
+    const result = parseDaisyMetadata(nccHtml)
+    expect(result.chapters).to.deep.equal([
+      { title: 'Chapter 1' },
+      { title: 'Chapter 2: The Road' },
+      { title: 'Part 1' }
+    ])
+  })
+})
--- a/test/server/utils/scandir.test.js
+++ b/test/server/utils/scandir.test.js
@ -49,4 +49,22 @@ describe('scanUtils', async () => {
      'Author/Series2/Book5/deeply/nested': ['cd 01/audiofile.mp3', 'cd 02/audiofile.mp3']
    })
  })
+
+  // A directory containing only ncc.html (no audio or ebook file) must still be grouped
+  // as a library item when the fourth argument (presumably includeNonMediaFiles) is true.
+  it('should include DAISY ncc.html changes when includeNonMediaFiles is enabled', async () => {
+    const filePath = 'Author/Book3/ncc.html'
+    const dirname = Path.dirname(filePath)
+    // Build the single file item by hand from the relative path
+    const fileItems = [
+      {
+        name: Path.basename(filePath),
+        reldirpath: dirname === '.' ? '' : dirname,
+        extension: Path.extname(filePath),
+        deep: filePath.split('/').length - 1
+      }
+    ]
+
+    const libraryItemGrouping = scanUtils.groupFileItemsIntoLibraryItemDirs('book', fileItems, false, true)
+    expect(libraryItemGrouping).to.deep.equal({
+      'Author/Book3': ['ncc.html']
+    })
+  })
 })