From b5e620b981a3d11ec2faf3d62af2887603a59449 Mon Sep 17 00:00:00 2001
From: korjik
Date: Wed, 22 Apr 2026 10:00:03 -0700
Subject: [PATCH] Add AI book dedupe tool for book libraries

---
 .../modals/libraries/LibraryTools.vue   |  45 +++++
 server/controllers/LibraryController.js | 176 ++++++++++++++++++
 server/providers/OpenAI.js              | 140 ++++++++++++++
 server/routers/ApiRouter.js             |   1 +
 test/server/providers/OpenAI.test.js    |  57 ++++++
 5 files changed, 419 insertions(+)

diff --git a/client/components/modals/libraries/LibraryTools.vue b/client/components/modals/libraries/LibraryTools.vue
index 00ed4f8d..1dd5e2ba 100644
--- a/client/components/modals/libraries/LibraryTools.vue
+++ b/client/components/modals/libraries/LibraryTools.vue
@@ -15,6 +15,20 @@

         <p v-else class="text-sm text-warning">Configure OpenAI first in server settings.</p>
       </div>
+
+      <div class="w-full border border-black-200 p-4 my-8">
+        <div class="flex flex-wrap items-center">
+          <div>
+            <p class="text-lg">Dedupe Books With AI</p>
+            <p class="max-w-sm text-sm pt-2 text-gray-300">Analyze likely duplicate books in this library with OpenAI, keep the best copy, and remove the duplicate items. This deletes duplicate files from disk.</p>
+          </div>
+          <div class="grow" />
+          <div v-if="isOpenAIConfigured">
+            <ui-btn :disabled="processing" @click="dedupeBooksWithAI">Dedupe Books</ui-btn>
+          </div>
+          <p v-else class="text-sm text-warning">Configure OpenAI first in server settings.</p>
+        </div>
+      </div>
 
       <div class="w-full border border-black-200 p-4 my-8">
         <div class="flex flex-wrap items-center">
@@ -83,6 +97,18 @@ export default { } this.$store.commit('globals/setConfirmPrompt', payload) }, + dedupeBooksWithAI() { + const payload = { + message: 'Deduplicate books in this library with AI? Duplicate items chosen for removal will be deleted from the database and file system.', + callback: (confirmed) => { + if (confirmed) { + this.runBookDedupe() + } + }, + type: 'yesNo' + } + this.$store.commit('globals/setConfirmPrompt', payload) + }, runSeriesDetection(onlyMissing = true) { this.$emit('update:processing', true) this.$axios @@ -102,6 +128,25 @@ export default { this.$emit('update:processing', false) }) }, + runBookDedupe() { + this.$emit('update:processing', true) + this.$axios + .$post(`/api/libraries/${this.libraryId}/dedupe-books-with-ai?hard=1`) + .then((data) => { + if (!data.duplicatesRemoved) { + this.$toast.info(this.$strings.ToastNoUpdatesNecessary) + } else { + this.$toast.success(`AI removed ${data.duplicatesRemoved} duplicate books`) + } + }) + .catch((error) => { + console.error('Failed to dedupe books with AI', error) + this.$toast.error(error.response?.data || this.$strings.ToastFailedToUpdate) + }) + .finally(() => { + this.$emit('update:processing', false) + }) + }, removeAllMetadataClick(ext) { const payload = { message: this.$getString('MessageConfirmRemoveMetadataFiles', [ext]), diff --git a/server/controllers/LibraryController.js b/server/controllers/LibraryController.js index 045b6651..ea01a7be 100644 --- a/server/controllers/LibraryController.js +++ b/server/controllers/LibraryController.js @@ -1565,6 +1565,97 @@ class LibraryController { return [...groups.values()] } + normalizeBookTitleForAIDedupe(title) { + if (!title || typeof title !== 'string') return null + return title + .toLowerCase() + .replace(/\([^)]*\)/g, ' ') + .replace(/\[[^\]]*]/g, ' ') + .replace(/\b(unabridged|abridged|audiobook|audio book)\b/g, ' ') + .replace(/[^a-z0-9]+/g, ' ') + .replace(/\s+/g, ' ') + .trim() + } + + groupLibraryBooksForAIDedupe(libraryItems) { + const parent = new Map() + const find = (id) => { + if (parent.get(id) !== id) { + parent.set(id, find(parent.get(id))) + } + return parent.get(id) + } + const union = (a, b) => { + const rootA = find(a) + const rootB = find(b) + if (rootA !== rootB) parent.set(rootB, rootA) + } + + libraryItems.forEach((libraryItem) => parent.set(libraryItem.id, libraryItem.id)) + + const candidateMaps = [new Map(), new Map(), new Map()] + libraryItems.forEach((libraryItem) => { + const metadata = libraryItem.media.oldMetadataToJSON() + const primaryAuthor = metadata.authors?.[0]?.name?.trim().toLowerCase() || null + const normalizedTitle = LibraryController.prototype.normalizeBookTitleForAIDedupe.call(this, metadata.title || '') + const isbn = metadata.isbn?.replace(/[-\s]/g, '').toLowerCase() || null + const asin = metadata.asin?.trim().toLowerCase() || null + + const candidateKeys = [] + if (primaryAuthor && normalizedTitle) candidateKeys.push([candidateMaps[0], `${primaryAuthor}::${normalizedTitle}`]) + if (isbn) candidateKeys.push([candidateMaps[1], `isbn::${isbn}`]) + if (asin) candidateKeys.push([candidateMaps[2], `asin::${asin}`]) + + candidateKeys.forEach(([candidateMap, key]) => { + if (!candidateMap.has(key)) candidateMap.set(key, []) + candidateMap.get(key).push(libraryItem) + }) + }) + + candidateMaps.forEach((candidateMap) => { + candidateMap.forEach((groupItems) => { + if (groupItems.length < 2) return + for (let i = 1; i < groupItems.length; i++) { + union(groupItems[0].id, groupItems[i].id) + } + }) + }) + + const grouped = 
new Map() + libraryItems.forEach((libraryItem) => { + const root = find(libraryItem.id) + if (!grouped.has(root)) grouped.set(root, []) + grouped.get(root).push(libraryItem) + }) + + return [...grouped.values()] + .filter((groupItems) => groupItems.length > 1) + .map((groupItems) => ({ + label: groupItems.map((libraryItem) => libraryItem.media.title).join(' | '), + libraryItems: groupItems.sort((a, b) => a.media.title.localeCompare(b.media.title)) + })) + } + + getDeleteDependenciesForLibraryItem(libraryItem) { + const mediaItemIds = [] + const authorIds = [] + const seriesIds = [] + + mediaItemIds.push(libraryItem.media.id) + if (libraryItem.media.authors?.length) { + authorIds.push(...libraryItem.media.authors.map((author) => author.id)) + } + if (libraryItem.media.series?.length) { + seriesIds.push(...libraryItem.media.series.map((series) => series.id)) + } + + return { + mediaItemIds, + authorIds, + seriesIds + } + } + /** * POST: /api/libraries/:id/detect-series-with-ai * @@ -1706,6 +1797,91 @@ class LibraryController { } } + /** + * POST: /api/libraries/:id/dedupe-books-with-ai + * + * @this {import('../routers/ApiRouter')} + * + * @param {LibraryControllerRequest} req + * @param {Response} res + */ + async dedupeBooksWithAI(req, res) { + if (!req.user.canUpdate) { + Logger.warn(`[LibraryController] User "${req.user.username}" attempted AI dedupe without update permissions`) + return res.sendStatus(403) + } + if (req.library.mediaType !== 'book') { + return res.status(400).send('AI book dedupe is only available for book libraries') + } + if (!openAI.isConfigured) { + return res.status(400).send('OpenAI is not configured') + } + + const hardDelete = req.query.hard !== '0' + + try { + const libraryItems = await LibraryController.prototype.getLibraryBooksForAISeriesDetection.call(this, req.library.id) + const candidateGroups = LibraryController.prototype.groupLibraryBooksForAIDedupe.call(this, libraryItems) + + let groupsProcessed = 0 + let duplicatesRemoved = 0 + const removedIds = new Set() + const authorIdsToCheck = new Set() + const seriesIdsToCheck = new Set() + + for (const candidateGroup of candidateGroups) { + const activeLibraryItems = candidateGroup.libraryItems.filter((libraryItem) => !removedIds.has(libraryItem.id)) + if (activeLibraryItems.length < 2) continue + + Logger.info(`[LibraryController] AI dedupe evaluating candidate group "${candidateGroup.label}" with ${activeLibraryItems.length} books`) + const decisions = await openAI.detectDuplicateBooks(activeLibraryItems) + groupsProcessed++ + + for (const decision of decisions) { + for (const duplicateId of decision.duplicateIds) { + if (removedIds.has(duplicateId) || duplicateId === decision.keepId) continue + const duplicateItem = activeLibraryItems.find((libraryItem) => libraryItem.id === duplicateId) + if (!duplicateItem) continue + + Logger.info( + `[LibraryController] AI dedupe removing duplicate "${duplicateItem.media.title}" (${duplicateItem.id}) keeping "${decision.keepId}" reason="${decision.reason || ''}"` + ) + + const deleteDependencies = LibraryController.prototype.getDeleteDependenciesForLibraryItem.call(this, duplicateItem) + await this.handleDeleteLibraryItem(duplicateItem.id, deleteDependencies.mediaItemIds, req.library.id) + if (hardDelete) { + await fs.remove(duplicateItem.path).catch((error) => { + Logger.error(`[LibraryController] Failed to hard-delete duplicate item path "${duplicateItem.path}"`, error) + }) + } + + deleteDependencies.authorIds.forEach((authorId) => authorIdsToCheck.add(authorId)) + 
deleteDependencies.seriesIds.forEach((seriesId) => seriesIdsToCheck.add(seriesId)) + removedIds.add(duplicateItem.id) + duplicatesRemoved++ + } + } + } + + await this.checkRemoveAuthorsWithNoBooks([...authorIdsToCheck]) + await this.checkRemoveEmptySeries([...seriesIdsToCheck]) + await Database.resetLibraryIssuesFilterData(req.library.id) + + Logger.info( + `[LibraryController] AI book dedupe completed for library "${req.library.name}" - groupsProcessed=${groupsProcessed}, duplicatesRemoved=${duplicatesRemoved}, hardDelete=${hardDelete}` + ) + + res.json({ + groupsProcessed, + duplicatesRemoved, + hardDelete + }) + } catch (error) { + Logger.error(`[LibraryController] Failed AI dedupe for library "${req.library.name}"`, error) + res.status(500).send(error.message || 'Failed to dedupe books with AI') + } + } + /** * * @param {RequestWithUser} req diff --git a/server/providers/OpenAI.js b/server/providers/OpenAI.js index 6d08cd99..1b301282 100644 --- a/server/providers/OpenAI.js +++ b/server/providers/OpenAI.js @@ -50,6 +50,14 @@ class OpenAI { }) } + summarizeDuplicateDecisionForLog(decision) { + return JSON.stringify({ + keepId: decision.keepId, + duplicateIds: decision.duplicateIds, + reason: decision.reason || '' + }) + } + normalizePathForPrompt(filePath) { if (!filePath || typeof filePath !== 'string') return null return filePath.replace(/\\/g, '/') @@ -316,6 +324,51 @@ class OpenAI { }) } + validateDuplicateBooksPayload(payload, books) { + const resultGroups = Array.isArray(payload?.groups) ? payload.groups : [] + const expectedIds = new Set(books.map((book) => book.id)) + const consumedIds = new Set() + const validated = [] + + resultGroups.forEach((group) => { + const keepId = this.normalizeOptionalString(group?.keepId, 120) + if (!keepId || !expectedIds.has(keepId)) { + Logger.warn(`[OpenAI] Ignoring duplicate-books group with invalid keepId "${group?.keepId}"`) + return + } + if (consumedIds.has(keepId)) { + Logger.warn(`[OpenAI] Ignoring duplicate-books group because keepId "${keepId}" was already used`) + return + } + + const duplicateIds = Array.isArray(group?.duplicateIds) + ? 
group.duplicateIds + .map((duplicateId) => this.normalizeOptionalString(duplicateId, 120)) + .filter((duplicateId) => duplicateId && expectedIds.has(duplicateId) && duplicateId !== keepId) + : [] + + const dedupedDuplicateIds = [] + const seenDuplicateIds = new Set() + duplicateIds.forEach((duplicateId) => { + if (seenDuplicateIds.has(duplicateId) || consumedIds.has(duplicateId)) return + seenDuplicateIds.add(duplicateId) + dedupedDuplicateIds.push(duplicateId) + }) + + if (!dedupedDuplicateIds.length) return + + consumedIds.add(keepId) + dedupedDuplicateIds.forEach((duplicateId) => consumedIds.add(duplicateId)) + validated.push({ + keepId, + duplicateIds: dedupedDuplicateIds, + reason: this.normalizeOptionalString(group?.reason, 600) || '' + }) + }) + + return validated + } + validateBookIds(resultBooks, books) { if (!Array.isArray(resultBooks) || resultBooks.length !== books.length) { throw new Error('OpenAI returned an invalid number of books') @@ -742,6 +795,93 @@ ${JSON.stringify(mediaFiles, null, 2)}` }) return validated } + + async detectDuplicateBooks(libraryItems) { + if (!this.isConfigured) { + throw new Error('OpenAI API key is not configured') + } + + const books = libraryItems.map((libraryItem) => { + const metadata = libraryItem.media.oldMetadataToJSON() + const folderContext = this.getFolderContext(libraryItem) + const metadataCompletenessScore = [ + metadata.title, + metadata.subtitle, + metadata.description, + metadata.isbn, + metadata.asin, + metadata.publisher, + metadata.language, + metadata.publishedYear, + metadata.authors?.length ? 'authors' : null, + metadata.series?.length ? 'series' : null, + metadata.narrators?.length ? 'narrators' : null, + libraryItem.media.coverPath ? 'cover' : null + ].filter(Boolean).length + + return { + id: libraryItem.id, + title: metadata.title || null, + subtitle: metadata.subtitle || null, + authors: (metadata.authors || []).map((author) => author.name), + narrators: metadata.narrators || [], + series: (metadata.series || []).map((series) => ({ name: series.name, sequence: series.sequence || null })), + publishedYear: metadata.publishedYear || null, + description: this.cleanDescription(metadata.description), + language: metadata.language || null, + abridged: !!metadata.abridged, + explicit: !!metadata.explicit, + isbn: metadata.isbn || null, + asin: metadata.asin || null, + duration: libraryItem.media.duration || null, + size: libraryItem.media.size || libraryItem.size || null, + numAudioFiles: libraryItem.media.audioFiles?.length || 0, + numChapters: libraryItem.media.chapters?.length || 0, + hasCover: !!libraryItem.media.coverPath, + ebookFormat: libraryItem.media.ebookFile?.ebookFormat || null, + isFile: !!libraryItem.isFile, + fullPath: folderContext.fullPath, + relPath: folderContext.relPath, + metadataCompletenessScore + } + }) + + Logger.info(`[OpenAI] Evaluating duplicate books for ${books.length} candidates`) + books.forEach((book) => { + Logger.info(`[OpenAI] Duplicate-books candidate ${JSON.stringify(book)}`) + }) + + const prompt = `You identify duplicate audiobook library items that represent the same underlying book/work and choose which copy to keep. + +Return only valid JSON in this shape: +{ + "groups": [ + { + "keepId": "library-item-id-to-keep", + "duplicateIds": ["library-item-id-to-remove"], + "reason": "brief reason" + } + ] +} + +Rules: +- Only mark items as duplicates if they are clearly the same book/work. +- Books in the same series are not duplicates unless they are the same title/work. 
+- Different abridged vs unabridged editions, different languages, dramatizations, companions, or supplemental books are not duplicates unless the evidence strongly indicates they are just duplicate copies. +- Prefer keeping the copy with richer metadata, cleaner path naming, cover art, more complete file data, and generally better organization. +- Do not include a group if no duplicates should be removed. +- Do not include the same id in more than one group. + +Books: +${JSON.stringify(books, null, 2)}` + + const payload = await this.createResponse(prompt) + const validated = this.validateDuplicateBooksPayload(payload, books) + validated.forEach((decision) => { + Logger.info(`[OpenAI] Duplicate-books result ${this.summarizeDuplicateDecisionForLog(decision)}`) + }) + return validated + } } module.exports = OpenAI diff --git a/server/routers/ApiRouter.js b/server/routers/ApiRouter.js index faa6ba2b..6002f975 100644 --- a/server/routers/ApiRouter.js +++ b/server/routers/ApiRouter.js @@ -90,6 +90,7 @@ class ApiRouter { this.router.get('/libraries/:id/matchall', LibraryController.middleware.bind(this), LibraryController.matchAll.bind(this)) this.router.post('/libraries/:id/scan', LibraryController.middleware.bind(this), LibraryController.scan.bind(this)) this.router.post('/libraries/:id/detect-series-with-ai', LibraryController.middleware.bind(this), LibraryController.detectSeriesWithAI.bind(this)) + this.router.post('/libraries/:id/dedupe-books-with-ai', LibraryController.middleware.bind(this), LibraryController.dedupeBooksWithAI.bind(this)) this.router.get('/libraries/:id/recent-episodes', LibraryController.middleware.bind(this), LibraryController.getRecentEpisodes.bind(this)) this.router.get('/libraries/:id/opml', LibraryController.middleware.bind(this), LibraryController.getOPMLFile.bind(this)) this.router.post('/libraries/order', LibraryController.reorder.bind(this)) diff --git a/test/server/providers/OpenAI.test.js b/test/server/providers/OpenAI.test.js index 53fed180..4e86a93b 100644 --- a/test/server/providers/OpenAI.test.js +++ b/test/server/providers/OpenAI.test.js @@ -212,4 +212,61 @@ describe('OpenAI', () => { expect(result[1].reason).to.contain('omitted this media file') }) }) + + describe('validateDuplicateBooksPayload', () => { + it('normalizes valid duplicate-book groups', () => { + const result = openAI.validateDuplicateBooksPayload( + { + groups: [ + { + keepId: 'a', + duplicateIds: ['b', 'c'], + reason: 'same book' + } + ] + }, + [{ id: 'a' }, { id: 'b' }, { id: 'c' }] + ) + + expect(result).to.deep.equal([ + { + keepId: 'a', + duplicateIds: ['b', 'c'], + reason: 'same book' + } + ]) + }) + + it('ignores invalid and overlapping duplicate-book groups', () => { + const result = openAI.validateDuplicateBooksPayload( + { + groups: [ + { + keepId: 'a', + duplicateIds: ['b', 'missing', 'a'], + reason: 'primary match' + }, + { + keepId: 'b', + duplicateIds: ['c'], + reason: 'should be skipped because b was consumed' + }, + { + keepId: 'z', + duplicateIds: ['c'] + } + ] + }, + [{ id: 'a' }, { id: 'b' }, { id: 'c' }] + ) + + expect(result).to.deep.equal([ + { + keepId: 'a', + duplicateIds: ['b'], + reason: 'primary match' + } + ]) + }) + }) })
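Usage sketch for the new route (assumptions: a Node 18+ runtime with global fetch, a valid API bearer token, and placeholder values for the server URL and library id; the route path, the hard flag, and the response fields come from LibraryController.dedupeBooksWithAI above, everything else is illustrative):

// Sketch: trigger AI book dedupe for one library and print the summary.
// serverUrl, ABS_API_TOKEN, and libraryId below are placeholders.
const serverUrl = 'http://localhost:13378'
const apiToken = process.env.ABS_API_TOKEN
const libraryId = 'replace-with-library-id'

async function dedupeLibraryBooks() {
  // hard=0 removes duplicate items from the database but keeps their files on
  // disk; omitting the flag (or hard=1) also deletes the files, matching the
  // web UI behavior in LibraryTools.vue.
  const response = await fetch(`${serverUrl}/api/libraries/${libraryId}/dedupe-books-with-ai?hard=0`, {
    method: 'POST',
    headers: { Authorization: `Bearer ${apiToken}` }
  })
  if (!response.ok) {
    throw new Error(`Dedupe request failed: ${response.status} ${await response.text()}`)
  }
  // Response shape: { groupsProcessed, duplicatesRemoved, hardDelete }
  const { groupsProcessed, duplicatesRemoved, hardDelete } = await response.json()
  console.log(`Processed ${groupsProcessed} candidate groups, removed ${duplicatesRemoved} duplicates (hardDelete=${hardDelete})`)
}

dedupeLibraryBooks().catch(console.error)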