commit b5e620b981 (parent 58776ca983)

    update

5 changed files with 419 additions and 0 deletions
client: library tools Vue component

@@ -15,6 +15,20 @@
       <p v-if="!openAIConfigured" class="text-sm text-yellow-400 mt-3">Configure OpenAI first in server settings.</p>
     </div>

     <div v-if="isBookLibrary" class="w-full border border-black-200 p-4 my-8">
       <div class="flex flex-wrap items-center">
         <div>
           <p class="text-lg">Dedupe Books With AI</p>
           <p class="max-w-sm text-sm pt-2 text-gray-300">Analyze likely duplicate books in this library with OpenAI, keep the best copy, and remove the duplicate items. This deletes duplicate files from disk.</p>
         </div>
         <div class="grow" />
         <div>
           <ui-btn :disabled="processing || !openAIConfigured" @click.stop="dedupeBooksWithAI">Dedupe Books</ui-btn>
         </div>
       </div>
       <p v-if="!openAIConfigured" class="text-sm text-yellow-400 mt-3">Configure OpenAI first in server settings.</p>
     </div>

     <div class="w-full border border-black-200 p-4 my-8">
       <div class="flex flex-wrap items-center">
         <div>
@@ -83,6 +97,18 @@ export default {
       }
       this.$store.commit('globals/setConfirmPrompt', payload)
     },
     dedupeBooksWithAI() {
       const payload = {
         message: 'Deduplicate books in this library with AI? Duplicate items chosen for removal will be deleted from the database and file system.',
         callback: (confirmed) => {
           if (confirmed) {
             this.runBookDedupe()
           }
         },
         type: 'yesNo'
       }
       this.$store.commit('globals/setConfirmPrompt', payload)
     },
     runSeriesDetection(onlyMissing = true) {
       this.$emit('update:processing', true)
       this.$axios
@@ -102,6 +128,25 @@ export default {
           this.$emit('update:processing', false)
         })
     },
     runBookDedupe() {
       this.$emit('update:processing', true)
       this.$axios
         .$post(`/api/libraries/${this.libraryId}/dedupe-books-with-ai?hard=1`)
         .then((data) => {
           if (!data.duplicatesRemoved) {
             this.$toast.info(this.$strings.ToastNoUpdatesNecessary)
           } else {
             this.$toast.success(`AI removed ${data.duplicatesRemoved} duplicate books`)
           }
         })
         .catch((error) => {
           console.error('Failed to dedupe books with AI', error)
           this.$toast.error(error.response?.data || this.$strings.ToastFailedToUpdate)
         })
         .finally(() => {
           this.$emit('update:processing', false)
         })
     },
     removeAllMetadataClick(ext) {
       const payload = {
         message: this.$getString('MessageConfirmRemoveMetadataFiles', [ext]),
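For orientation before the server-side code: the round trip the component makes is one POST and one small JSON body back. A minimal sketch in plain JavaScript (hypothetical host, library id, and token; the real component uses its authenticated $axios instance, and only the route, query param, and response keys come from this diff):

    // Hypothetical values throughout, shown with Node 18+ global fetch
    const res = await fetch('http://localhost:3333/api/libraries/lib_123/dedupe-books-with-ai?hard=1', {
      method: 'POST',
      headers: { Authorization: 'Bearer <api-token>' }
    })
    const data = await res.json() // e.g. { groupsProcessed: 2, duplicatesRemoved: 3, hardDelete: true }
    console.log(`AI removed ${data.duplicatesRemoved} duplicate books`)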
server: LibraryController

@@ -1565,6 +1565,97 @@ class LibraryController {
    return [...groups.values()]
  }

  normalizeBookTitleForAIDedupe(title) {
    if (!title || typeof title !== 'string') return null
    return title
      .toLowerCase()
      .replace(/\([^)]*\)/g, ' ')
      .replace(/\[[^\]]*]/g, ' ')
      .replace(/\b(unabridged|abridged|audiobook|audio book)\b/g, ' ')
      .replace(/[^a-z0-9]+/g, ' ')
      .replace(/\s+/g, ' ')
      .trim()
  }
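  // Worked example (titles illustrative): 'The Hobbit (Unabridged) [MP3]' and
  // 'The Hobbit: Audiobook' both normalize to 'the hobbit', so packaging noise
  // in titles cannot split an author+title candidate group.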

  groupLibraryBooksForAIDedupe(libraryItems) {
    const parent = new Map()
    const find = (id) => {
      if (parent.get(id) !== id) {
        parent.set(id, find(parent.get(id)))
      }
      return parent.get(id)
    }
    const union = (a, b) => {
      const rootA = find(a)
      const rootB = find(b)
      if (rootA !== rootB) parent.set(rootB, rootA)
    }

    libraryItems.forEach((libraryItem) => parent.set(libraryItem.id, libraryItem.id))

    const candidateMaps = [new Map(), new Map(), new Map()]
    libraryItems.forEach((libraryItem) => {
      const metadata = libraryItem.media.oldMetadataToJSON()
      const primaryAuthor = metadata.authors?.[0]?.name?.trim().toLowerCase() || null
      const normalizedTitle = LibraryController.prototype.normalizeBookTitleForAIDedupe.call(this, metadata.title || '')
      const isbn = metadata.isbn?.replace(/[-\s]/g, '').toLowerCase() || null
      const asin = metadata.asin?.trim().toLowerCase() || null

      const candidateKeys = []
      if (primaryAuthor && normalizedTitle) candidateKeys.push([candidateMaps[0], `${primaryAuthor}::${normalizedTitle}`])
      if (isbn) candidateKeys.push([candidateMaps[1], `isbn::${isbn}`])
      if (asin) candidateKeys.push([candidateMaps[2], `asin::${asin}`])

      candidateKeys.forEach(([candidateMap, key]) => {
        if (!candidateMap.has(key)) candidateMap.set(key, [])
        candidateMap.get(key).push(libraryItem)
      })
    })

    candidateMaps.forEach((candidateMap) => {
      candidateMap.forEach((groupItems) => {
        if (groupItems.length < 2) return
        for (let i = 1; i < groupItems.length; i++) {
          union(groupItems[0].id, groupItems[i].id)
        }
      })
    })

    const grouped = new Map()
    libraryItems.forEach((libraryItem) => {
      const root = find(libraryItem.id)
      if (!grouped.has(root)) grouped.set(root, [])
      grouped.get(root).push(libraryItem)
    })

    return [...grouped.values()]
      .filter((groupItems) => groupItems.length > 1)
      .map((groupItems) => ({
        label: groupItems.map((libraryItem) => libraryItem.media.title).join(' | '),
        libraryItems: groupItems.sort((a, b) => a.media.title.localeCompare(b.media.title))
      }))
  }
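  // Grouping sketch: the three candidate maps key items by author+title,
  // ISBN, and ASIN, and union-find merges transitively - if A and B share an
  // ISBN while B and C share author+title, then A, B, and C form one group
  // even though A and C match on no key directly.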

  getDeleteDependenciesForLibraryItem(libraryItem) {
    const mediaItemIds = []
    const authorIds = []
    const seriesIds = []

    mediaItemIds.push(libraryItem.media.id)
    if (libraryItem.media.authors?.length) {
      authorIds.push(...libraryItem.media.authors.map((author) => author.id))
    }
    if (libraryItem.media.series?.length) {
      seriesIds.push(...libraryItem.media.series.map((series) => series.id))
    }

    return {
      mediaItemIds,
      authorIds,
      seriesIds
    }
  }

  /**
   * POST: /api/libraries/:id/detect-series-with-ai
   *
@@ -1706,6 +1797,91 @@ class LibraryController {
      }
    }

  /**
   * POST: /api/libraries/:id/dedupe-books-with-ai
   *
   * @this {import('../routers/ApiRouter')}
   *
   * @param {LibraryControllerRequest} req
   * @param {Response} res
   */
  async dedupeBooksWithAI(req, res) {
    if (!req.user.canUpdate) {
      Logger.warn(`[LibraryController] User "${req.user.username}" attempted AI dedupe without update permissions`)
      return res.sendStatus(403)
    }
    if (req.library.mediaType !== 'book') {
      return res.status(400).send('AI book dedupe is only available for book libraries')
    }
    if (!openAI.isConfigured) {
      return res.status(400).send('OpenAI is not configured')
    }

    const hardDelete = req.query.hard !== '0'

    try {
      const libraryItems = await LibraryController.prototype.getLibraryBooksForAISeriesDetection.call(this, req.library.id)
      const candidateGroups = LibraryController.prototype.groupLibraryBooksForAIDedupe.call(this, libraryItems)

      let groupsProcessed = 0
      let duplicatesRemoved = 0
      const removedIds = new Set()
      const authorIdsToCheck = new Set()
      const seriesIdsToCheck = new Set()

      for (const candidateGroup of candidateGroups) {
        const activeLibraryItems = candidateGroup.libraryItems.filter((libraryItem) => !removedIds.has(libraryItem.id))
        if (activeLibraryItems.length < 2) continue

        Logger.info(`[LibraryController] AI dedupe evaluating candidate group "${candidateGroup.label}" with ${activeLibraryItems.length} books`)
        const decisions = await openAI.detectDuplicateBooks(activeLibraryItems)
        groupsProcessed++

        for (const decision of decisions) {
          for (const duplicateId of decision.duplicateIds) {
            if (removedIds.has(duplicateId) || duplicateId === decision.keepId) continue
            const duplicateItem = activeLibraryItems.find((libraryItem) => libraryItem.id === duplicateId)
            if (!duplicateItem) continue

            Logger.info(
              `[LibraryController] AI dedupe removing duplicate "${duplicateItem.media.title}" (${duplicateItem.id}) keeping "${decision.keepId}" reason="${decision.reason || ''}"`
            )

            const deleteDependencies = LibraryController.prototype.getDeleteDependenciesForLibraryItem.call(this, duplicateItem)
            await this.handleDeleteLibraryItem(duplicateItem.id, deleteDependencies.mediaItemIds, req.library.id)
            if (hardDelete) {
              await fs.remove(duplicateItem.path).catch((error) => {
                Logger.error(`[LibraryController] Failed to hard-delete duplicate item path "${duplicateItem.path}"`, error)
              })
            }

            deleteDependencies.authorIds.forEach((authorId) => authorIdsToCheck.add(authorId))
            deleteDependencies.seriesIds.forEach((seriesId) => seriesIdsToCheck.add(seriesId))
            removedIds.add(duplicateItem.id)
            duplicatesRemoved++
          }
        }
      }

      await this.checkRemoveAuthorsWithNoBooks([...authorIdsToCheck])
      await this.checkRemoveEmptySeries([...seriesIdsToCheck])
      await Database.resetLibraryIssuesFilterData(req.library.id)

      Logger.info(
        `[LibraryController] AI book dedupe completed for library "${req.library.name}" - groupsProcessed=${groupsProcessed}, duplicatesRemoved=${duplicatesRemoved}, hardDelete=${hardDelete}`
      )

      res.json({
        groupsProcessed,
        duplicatesRemoved,
        hardDelete
      })
    } catch (error) {
      Logger.error(`[LibraryController] Failed AI dedupe for library "${req.library.name}"`, error)
      res.status(500).send(error.message || 'Failed to dedupe books with AI')
    }
  }

  /**
   *
   * @param {RequestWithUser} req
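For reference, the decisions consumed by the loop above are the validated groups that openAI.detectDuplicateBooks() resolves to (shape per validateDuplicateBooksPayload below; the ids and reason here are illustrative):

    [
      {
        keepId: 'li_abc123',
        duplicateIds: ['li_def456'],
        reason: 'Same unabridged recording; kept the copy with cover art and an ISBN'
      }
    ]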
server: OpenAI client

@@ -50,6 +50,14 @@ class OpenAI {
    })
  }

  summarizeDuplicateDecisionForLog(decision) {
    return JSON.stringify({
      keepId: decision.keepId,
      duplicateIds: decision.duplicateIds,
      reason: decision.reason || ''
    })
  }

  normalizePathForPrompt(filePath) {
    if (!filePath || typeof filePath !== 'string') return null
    return filePath.replace(/\\/g, '/')
@@ -316,6 +324,51 @@ class OpenAI {
    })
  }

  validateDuplicateBooksPayload(payload, books) {
    const resultGroups = Array.isArray(payload?.groups) ? payload.groups : []
    const expectedIds = new Set(books.map((book) => book.id))
    const consumedIds = new Set()
    const validated = []

    resultGroups.forEach((group) => {
      const keepId = this.normalizeOptionalString(group?.keepId, 120)
      if (!keepId || !expectedIds.has(keepId)) {
        Logger.warn(`[OpenAI] Ignoring duplicate-books group with invalid keepId "${group?.keepId}"`)
        return
      }
      if (consumedIds.has(keepId)) {
        Logger.warn(`[OpenAI] Ignoring duplicate-books group because keepId "${keepId}" was already used`)
        return
      }

      const duplicateIds = Array.isArray(group?.duplicateIds)
        ? group.duplicateIds
            .map((duplicateId) => this.normalizeOptionalString(duplicateId, 120))
            .filter((duplicateId) => duplicateId && expectedIds.has(duplicateId) && duplicateId !== keepId)
        : []

      const dedupedDuplicateIds = []
      const seenDuplicateIds = new Set()
      duplicateIds.forEach((duplicateId) => {
        if (seenDuplicateIds.has(duplicateId) || consumedIds.has(duplicateId)) return
        seenDuplicateIds.add(duplicateId)
        dedupedDuplicateIds.push(duplicateId)
      })

      if (!dedupedDuplicateIds.length) return

      consumedIds.add(keepId)
      dedupedDuplicateIds.forEach((duplicateId) => consumedIds.add(duplicateId))
      validated.push({
        keepId,
        duplicateIds: dedupedDuplicateIds,
        reason: this.normalizeOptionalString(group?.reason, 600) || ''
      })
    })

    return validated
  }
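  // Validation sketch (hypothetical ids): for books [a, b, c], a model reply
  // of { groups: [{ keepId: 'a', duplicateIds: ['b', 'a', 'x'] }] } validates
  // to [{ keepId: 'a', duplicateIds: ['b'], reason: '' }] - self references,
  // ids outside the candidate set, and already-consumed ids are all dropped.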

  validateBookIds(resultBooks, books) {
    if (!Array.isArray(resultBooks) || resultBooks.length !== books.length) {
      throw new Error('OpenAI returned an invalid number of books')
@@ -742,6 +795,93 @@ ${JSON.stringify(mediaFiles, null, 2)}`
    })
    return validated
  }

  async detectDuplicateBooks(libraryItems) {
    if (!this.isConfigured) {
      throw new Error('OpenAI API key is not configured')
    }

    const books = libraryItems.map((libraryItem) => {
      const metadata = libraryItem.media.oldMetadataToJSON()
      const folderContext = this.getFolderContext(libraryItem)
      const metadataCompletenessScore = [
        metadata.title,
        metadata.subtitle,
        metadata.description,
        metadata.isbn,
        metadata.asin,
        metadata.publisher,
        metadata.language,
        metadata.publishedYear,
        metadata.authors?.length ? 'authors' : null,
        metadata.series?.length ? 'series' : null,
        metadata.narrators?.length ? 'narrators' : null,
        libraryItem.media.coverPath ? 'cover' : null
      ].filter(Boolean).length
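      // Twelve possible signals feed this count (8 scalar fields plus
      // authors/series/narrators/cover), so the score is a coarse 0-12
      // measure of which copy carries the richer metadata.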

      return {
        id: libraryItem.id,
        title: metadata.title || null,
        subtitle: metadata.subtitle || null,
        authors: (metadata.authors || []).map((author) => author.name),
        narrators: metadata.narrators || [],
        series: (metadata.series || []).map((series) => ({ name: series.name, sequence: series.sequence || null })),
        publishedYear: metadata.publishedYear || null,
        description: this.cleanDescription(metadata.description),
        language: metadata.language || null,
        abridged: !!metadata.abridged,
        explicit: !!metadata.explicit,
        isbn: metadata.isbn || null,
        asin: metadata.asin || null,
        duration: libraryItem.media.duration || null,
        size: libraryItem.media.size || libraryItem.size || null,
        numAudioFiles: libraryItem.media.audioFiles?.length || 0,
        numChapters: libraryItem.media.chapters?.length || 0,
        hasCover: !!libraryItem.media.coverPath,
        ebookFormat: libraryItem.media.ebookFile?.ebookFormat || null,
        isFile: !!libraryItem.isFile,
        fullPath: folderContext.fullPath,
        relPath: folderContext.relPath,
        metadataCompletenessScore
      }
    })

    Logger.info(`[OpenAI] Evaluating duplicate books for ${books.length} candidates`)
    books.forEach((book) => {
      Logger.info(`[OpenAI] Duplicate-books candidate ${JSON.stringify(book)}`)
    })

    const prompt = `You identify duplicate audiobook library items that represent the same underlying book/work and choose which copy to keep.

Return only valid JSON in this shape:
{
  "groups": [
    {
      "keepId": "library-item-id-to-keep",
      "duplicateIds": ["library-item-id-to-remove"],
      "reason": "brief reason"
    }
  ]
}

Rules:
- Only mark items as duplicates if they are clearly the same book/work.
- Books in the same series are not duplicates unless they are the same title/work.
- Abridged vs. unabridged editions, different languages, dramatizations, companions, and supplemental books are not duplicates unless the evidence strongly indicates they are simply duplicate copies.
- Prefer keeping the copy with richer metadata, cleaner path naming, cover art, more complete file data, and generally better organization.
- Do not include a group if no duplicates should be removed.
- Do not include the same id in more than one group.

Books:
${JSON.stringify(books, null, 2)}`

    const payload = await this.createResponse(prompt)
    const validated = this.validateDuplicateBooksPayload(payload, books)
    validated.forEach((decision) => {
      Logger.info(`[OpenAI] Duplicate-books result ${this.summarizeDuplicateDecisionForLog(decision)}`)
    })
    return validated
  }
}

module.exports = OpenAI
server: ApiRouter

@@ -90,6 +90,7 @@ class ApiRouter {
    this.router.get('/libraries/:id/matchall', LibraryController.middleware.bind(this), LibraryController.matchAll.bind(this))
    this.router.post('/libraries/:id/scan', LibraryController.middleware.bind(this), LibraryController.scan.bind(this))
    this.router.post('/libraries/:id/detect-series-with-ai', LibraryController.middleware.bind(this), LibraryController.detectSeriesWithAI.bind(this))
    this.router.post('/libraries/:id/dedupe-books-with-ai', LibraryController.middleware.bind(this), LibraryController.dedupeBooksWithAI.bind(this))
    this.router.get('/libraries/:id/recent-episodes', LibraryController.middleware.bind(this), LibraryController.getRecentEpisodes.bind(this))
    this.router.get('/libraries/:id/opml', LibraryController.middleware.bind(this), LibraryController.getOPMLFile.bind(this))
    this.router.post('/libraries/order', LibraryController.reorder.bind(this))
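Note that the route binds the handler to the ApiRouter instance, matching the @this {import('../routers/ApiRouter')} annotation on dedupeBooksWithAI above; that binding is also why the controller reaches its own helpers through LibraryController.prototype.<helper>.call(this, ...) rather than a plain method call.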
test: OpenAI unit tests (mocha/chai)

@@ -212,4 +212,61 @@ describe('OpenAI', () => {
      expect(result[1].reason).to.contain('omitted this media file')
    })
  })

  describe('validateDuplicateBooksPayload', () => {
    it('normalizes valid duplicate-book groups', () => {
      const result = openAI.validateDuplicateBooksPayload(
        {
          groups: [
            {
              keepId: 'a',
              duplicateIds: ['b', 'c'],
              reason: 'same book'
            }
          ]
        },
        [{ id: 'a' }, { id: 'b' }, { id: 'c' }]
      )

      expect(result).to.deep.equal([
        {
          keepId: 'a',
          duplicateIds: ['b', 'c'],
          reason: 'same book'
        }
      ])
    })

    it('ignores invalid and overlapping duplicate-book groups', () => {
      const result = openAI.validateDuplicateBooksPayload(
        {
          groups: [
            {
              keepId: 'a',
              duplicateIds: ['b', 'missing', 'a'],
              reason: 'primary match'
            },
            {
              keepId: 'b',
              duplicateIds: ['c'],
              reason: 'should be skipped because b was consumed'
            },
            {
              keepId: 'z',
              duplicateIds: ['c']
            }
          ]
        },
        [{ id: 'a' }, { id: 'b' }, { id: 'c' }]
      )

      expect(result).to.deep.equal([
        {
          keepId: 'a',
          duplicateIds: ['b'],
          reason: 'primary match'
        }
      ])
    })
  })
})
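Both specs call validateDuplicateBooksPayload directly on the shared openAI instance with Chai deep-equality assertions, so they exercise the response validation without any OpenAI network access.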