feat: implement duplicate title normalized filter

This commit is contained in:
Tiberiu Ichim 2026-02-22 16:46:14 +02:00
parent aa85106681
commit ead215e777
13 changed files with 276 additions and 1 deletions

View file

@ -0,0 +1,159 @@
const util = require('util')
const { getNormalizedTitle } = require('../utils')
/**
* @typedef MigrationContext
* @property {import('sequelize').QueryInterface} queryInterface - a sequelize QueryInterface object.
* @property {import('../Logger')} logger - a Logger object.
*
* @typedef MigrationOptions
* @property {MigrationContext} context - an object containing the migration context.
*/
// Version string this migration is tagged with.
const migrationVersion = '2.32.9'
// Full migration name: the version followed by a short description of the change.
const migrationName = [migrationVersion, 'add-title-normalized-columns'].join('-')
// Prefix placed in front of every log line written by this migration.
const loggerPrefix = `[${migrationVersion} migration]`
/**
 * Upgrade: adds a nullable `titleNormalized` column to libraryItems, books and podcasts,
 * backfills it for books and podcasts from their titles, copies the values onto
 * libraryItems, installs triggers that keep libraryItems in sync and adds an index
 * on (libraryId, mediaType, titleNormalized).
 *
 * @param {MigrationOptions} options - an object containing the migration context.
 * @returns {Promise<void>} - A promise that resolves when the migration is complete.
 */
async function up({ context: { queryInterface, logger } }) {
  logger.info(`${loggerPrefix} UPGRADE BEGIN: ${migrationName}`)

  // 1. Add columns (a fresh options object is passed on every call)
  for (const table of ['libraryItems', 'books', 'podcasts']) {
    await addColumn(queryInterface, logger, table, 'titleNormalized', { type: queryInterface.sequelize.Sequelize.STRING, allowNull: true })
  }

  // 2. Backfill the media tables one row at a time, books first
  await backfillTitleNormalized(queryInterface, logger, 'books')
  await backfillTitleNormalized(queryInterface, logger, 'podcasts')

  const mediaTables = ['books', 'podcasts']

  // 3. Copy from books/podcasts to libraryItems
  for (const mediaTable of mediaTables) {
    await copyColumn(queryInterface, logger, mediaTable, 'titleNormalized', 'id', 'libraryItems', 'titleNormalized', 'mediaId')
  }

  // 4. Add triggers
  for (const mediaTable of mediaTables) {
    await addTrigger(queryInterface, logger, mediaTable, 'titleNormalized', 'id', 'libraryItems', 'titleNormalized', 'mediaId')
  }

  // 5. Add index on libraryItems
  await addIndex(queryInterface, logger, 'libraryItems', ['libraryId', 'mediaType', { name: 'titleNormalized', collate: 'NOCASE' }])

  logger.info(`${loggerPrefix} UPGRADE END: ${migrationName}`)
}

/**
 * Computes and stores `titleNormalized` for every row of `table` that has a title.
 * Rows whose title is empty or null are left untouched.
 *
 * @param {import('sequelize').QueryInterface} queryInterface
 * @param {import('../Logger')} logger
 * @param {string} table - name of a table with `id`, `title` and `titleNormalized` columns
 * @returns {Promise<void>}
 */
async function backfillTitleNormalized(queryInterface, logger, table) {
  logger.info(`${loggerPrefix} Backfilling titleNormalized for ${table}`)
  const rows = await queryInterface.sequelize.query(`SELECT id, title FROM ${table}`, { type: queryInterface.sequelize.QueryTypes.SELECT })
  for (const row of rows) {
    if (!row.title) continue
    // Values are passed as named replacements rather than interpolated into the SQL text.
    await queryInterface.sequelize.query(`UPDATE ${table} SET titleNormalized = :titleNormalized WHERE id = :id`, {
      replacements: { titleNormalized: getNormalizedTitle(row.title), id: row.id }
    })
  }
}
/**
 * Downgrade: undoes the upgrade by removing the libraryItems index, dropping both
 * sync triggers and removing the `titleNormalized` column from all three tables.
 *
 * @param {MigrationOptions} options - an object containing the migration context.
 * @returns {Promise<void>} - A promise that resolves when the migration is complete.
 */
async function down({ context: { queryInterface, logger } }) {
  const columnName = 'titleNormalized'
  const itemsTable = 'libraryItems'
  const mediaTables = ['books', 'podcasts']

  logger.info(`${loggerPrefix} DOWNGRADE BEGIN: ${migrationName}`)

  await removeIndex(queryInterface, logger, itemsTable, ['libraryId', 'mediaType', columnName])

  for (const mediaTable of mediaTables) {
    await removeTrigger(queryInterface, logger, itemsTable, columnName, mediaTable)
  }

  // libraryItems first, then the media tables
  for (const table of [itemsTable, ...mediaTables]) {
    await removeColumn(queryInterface, logger, table, columnName)
  }

  logger.info(`${loggerPrefix} DOWNGRADE END: ${migrationName}`)
}
/**
 * Adds an index on `columns` to `tableName`, treating an "already exists" database
 * error as success so the migration can be re-run.
 *
 * @param {import('sequelize').QueryInterface} queryInterface
 * @param {import('../Logger')} logger
 * @param {string} tableName
 * @param {(string|{name: string})[]} columns - column names, or objects carrying a `name`
 * @returns {Promise<void>}
 * @throws rethrows any error other than a SequelizeDatabaseError whose message contains "already exists"
 */
async function addIndex(queryInterface, logger, tableName, columns) {
  const columnString = columns.map((column) => util.inspect(column)).join(', ')
  const columnNames = columns.map((column) => (typeof column === 'string' ? column : column.name))
  // Only used in the log messages below.
  const indexName = convertToSnakeCase(`${tableName}_${columnNames.join('_')}`)
  const description = `index on [${columnString}] to table ${tableName}. index name: ${indexName}`

  try {
    logger.info(`${loggerPrefix} adding ${description}`)
    await queryInterface.addIndex(tableName, columns)
    logger.info(`${loggerPrefix} added ${description}`)
  } catch (error) {
    // String() keeps the check safe when the error carries no string message.
    const alreadyExists = error.name === 'SequelizeDatabaseError' && String(error.message).includes('already exists')
    if (!alreadyExists) throw error
    logger.info(`${loggerPrefix} index [${columnString}] for table "${tableName}" already exists`)
  }
}
/**
 * Removes the index on `columns` from `tableName`. This is best-effort: a failure
 * does not abort the caller, but it is logged instead of being silently discarded.
 *
 * @param {import('sequelize').QueryInterface} queryInterface
 * @param {import('../Logger')} logger
 * @param {string} tableName
 * @param {string[]} columns
 * @returns {Promise<void>}
 */
async function removeIndex(queryInterface, logger, tableName, columns) {
  const columnString = columns.join(', ')
  logger.info(`${loggerPrefix} removing index [${columnString}] from table "${tableName}"`)
  try {
    await queryInterface.removeIndex(tableName, columns)
    logger.info(`${loggerPrefix} removed index [${columnString}] from table "${tableName}"`)
  } catch (error) {
    // Keep going, but leave a trace of why the index was not removed.
    const reason = error && error.message ? error.message : String(error)
    logger.info(`${loggerPrefix} failed to remove index [${columnString}] from table "${tableName}": ${reason}`)
  }
}
/**
 * Adds `column` to `table` unless the table description already lists it.
 *
 * @param {import('sequelize').QueryInterface} queryInterface
 * @param {import('../Logger')} logger
 * @param {string} table
 * @param {string} column
 * @param {object} options - column definition forwarded to queryInterface.addColumn
 * @returns {Promise<void>}
 */
async function addColumn(queryInterface, logger, table, column, options) {
  logger.info(`${loggerPrefix} adding column "${column}" to table "${table}"`)

  const existingColumns = await queryInterface.describeTable(table)
  if (existingColumns[column]) {
    logger.info(`${loggerPrefix} column "${column}" already exists in table "${table}"`)
    return
  }

  await queryInterface.addColumn(table, column, options)
  logger.info(`${loggerPrefix} added column "${column}" to table "${table}"`)
}
/**
 * Removes `column` from `table`. Mirrors addColumn's existence check: when the table
 * description does not list the column, nothing is removed, so a repeated or partial
 * downgrade is harmless.
 *
 * @param {import('sequelize').QueryInterface} queryInterface
 * @param {import('../Logger')} logger
 * @param {string} table
 * @param {string} column
 * @returns {Promise<void>}
 */
async function removeColumn(queryInterface, logger, table, column) {
  logger.info(`${loggerPrefix} removing column "${column}" from table "${table}"`)

  const tableDescription = await queryInterface.describeTable(table)
  if (!tableDescription[column]) {
    logger.info(`${loggerPrefix} column "${column}" does not exist in table "${table}"`)
    return
  }

  await queryInterface.removeColumn(table, column)
  logger.info(`${loggerPrefix} removed column "${column}" from table "${table}"`)
}
/**
 * Copies `sourceColumn` of `sourceTable` into `targetColumn` of `targetTable` for every
 * pair of rows where the target's `targetIdColumn` equals the source's `sourceIdColumn`,
 * using a single UPDATE ... FROM statement.
 *
 * @param {import('sequelize').QueryInterface} queryInterface
 * @param {import('../Logger')} logger
 * @param {string} sourceTable
 * @param {string} sourceColumn
 * @param {string} sourceIdColumn
 * @param {string} targetTable
 * @param {string} targetColumn
 * @param {string} targetIdColumn
 * @returns {Promise<void>}
 */
async function copyColumn(queryInterface, logger, sourceTable, sourceColumn, sourceIdColumn, targetTable, targetColumn, targetIdColumn) {
  const summary = `column "${sourceColumn}" from table "${sourceTable}" to table "${targetTable}"`
  logger.info(`${loggerPrefix} copying ${summary}`)

  // The leading and trailing empty entries reproduce the newlines that wrap the statement.
  const statement = [
    '',
    `UPDATE ${targetTable}`,
    `SET ${targetColumn} = ${sourceTable}.${sourceColumn}`,
    `FROM ${sourceTable}`,
    `WHERE ${targetTable}.${targetIdColumn} = ${sourceTable}.${sourceIdColumn}`,
    ''
  ].join('\n')
  await queryInterface.sequelize.query(statement)

  logger.info(`${loggerPrefix} copied ${summary}`)
}
/**
 * (Re)creates an AFTER UPDATE trigger on `sourceTable.sourceColumn` that writes the new
 * value into `targetTable.targetColumn` for rows whose `targetIdColumn` equals the
 * updated row's `sourceIdColumn`. Any trigger with the same name is dropped first.
 *
 * @param {import('sequelize').QueryInterface} queryInterface
 * @param {import('../Logger')} logger
 * @param {string} sourceTable
 * @param {string} sourceColumn
 * @param {string} sourceIdColumn
 * @param {string} targetTable
 * @param {string} targetColumn
 * @param {string} targetIdColumn
 * @returns {Promise<void>}
 */
async function addTrigger(queryInterface, logger, sourceTable, sourceColumn, sourceIdColumn, targetTable, targetColumn, targetIdColumn) {
  logger.info(`${loggerPrefix} adding trigger to update ${targetTable}.${targetColumn} when ${sourceTable}.${sourceColumn} is updated`)

  const { sequelize } = queryInterface
  const name = convertToSnakeCase(`update_${targetTable}_${targetColumn}_from_${sourceTable}`)

  await sequelize.query(`DROP TRIGGER IF EXISTS ${name}`)

  // The leading and trailing empty entries reproduce the newlines that wrap the statement.
  const createStatement = [
    '',
    `CREATE TRIGGER ${name}`,
    `AFTER UPDATE OF ${sourceColumn} ON ${sourceTable}`,
    'FOR EACH ROW',
    'BEGIN',
    `UPDATE ${targetTable}`,
    `SET ${targetColumn} = NEW.${sourceColumn}`,
    `WHERE ${targetTable}.${targetIdColumn} = NEW.${sourceIdColumn};`,
    'END;',
    ''
  ].join('\n')
  await sequelize.query(createStatement)

  logger.info(`${loggerPrefix} added trigger.`)
}
/**
 * Drops, if present, the trigger named after the target table, target column and
 * source table (the same name addTrigger builds).
 *
 * @param {import('sequelize').QueryInterface} queryInterface
 * @param {import('../Logger')} logger
 * @param {string} targetTable
 * @param {string} targetColumn
 * @param {string} sourceTable
 * @returns {Promise<void>}
 */
async function removeTrigger(queryInterface, logger, targetTable, targetColumn, sourceTable) {
  logger.info(`${loggerPrefix} removing trigger`)
  const rawName = ['update', targetTable, targetColumn, 'from', sourceTable].join('_')
  const dropStatement = `DROP TRIGGER IF EXISTS ${convertToSnakeCase(rawName)}`
  await queryInterface.sequelize.query(dropStatement)
}
/**
 * Converts a camelCase identifier to snake_case: every ASCII uppercase letter gets an
 * underscore in front of it, then the whole string is lowercased.
 *
 * @example "libraryItems_titleNormalized" => "library_items_title_normalized"
 * @param {string} str
 * @returns {string}
 */
function convertToSnakeCase(str) {
  const withUnderscores = str.replace(/[A-Z]/g, (upper) => `_${upper}`)
  return withUnderscores.toLowerCase()
}
// Expose the upgrade (`up`) and downgrade (`down`) handlers of this migration.
module.exports = { up, down }

View file

@ -146,6 +146,7 @@ class Book extends Model {
},
title: DataTypes.STRING,
titleIgnorePrefix: DataTypes.STRING,
titleNormalized: DataTypes.STRING,
subtitle: DataTypes.STRING,
publishedYear: DataTypes.STRING,
publishedDate: DataTypes.STRING,
@ -407,7 +408,9 @@ class Book extends Model {
this[key] = payload.metadata[key] || null
if (key === 'title') {
const { getTitleIgnorePrefix, getNormalizedTitle } = require('../utils')
this.titleIgnorePrefix = getTitleIgnorePrefix(this.title)
this.titleNormalized = getNormalizedTitle(this.title)
}
hasUpdates = true

View file

@ -78,6 +78,8 @@ class LibraryItem extends Model {
/** @type {string} */
this.titleIgnorePrefix // Only used for sorting
/** @type {string} */
this.titleNormalized // Used for grouping duplicate titles
/** @type {string} */
this.authorNamesFirstLast // Only used for sorting
/** @type {string} */
this.authorNamesLastFirst // Only used for sorting
@ -687,6 +689,7 @@ class LibraryItem extends Model {
extraData: DataTypes.JSON,
title: DataTypes.STRING,
titleIgnorePrefix: DataTypes.STRING,
titleNormalized: DataTypes.STRING,
authorNamesFirstLast: DataTypes.STRING,
authorNamesLastFirst: DataTypes.STRING,
isNotConsolidated: {
@ -719,6 +722,9 @@ class LibraryItem extends Model {
{
fields: ['libraryId', 'mediaType', { name: 'titleIgnorePrefix', collate: 'NOCASE' }]
},
{
fields: ['libraryId', 'mediaType', { name: 'titleNormalized', collate: 'NOCASE' }]
},
{
fields: ['libraryId', 'mediaType', { name: 'authorNamesFirstLast', collate: 'NOCASE' }]
},
@ -795,6 +801,7 @@ class LibraryItem extends Model {
if (instance.media) {
instance.title = instance.media.title
instance.titleIgnorePrefix = instance.media.titleIgnorePrefix
instance.titleNormalized = instance.media.titleNormalized
if (instance.isBook) {
if (instance.media.authors !== undefined) {
instance.authorNamesFirstLast = instance.media.authorName

View file

@ -1,5 +1,5 @@
const { DataTypes, Model } = require('sequelize')
const { getTitlePrefixAtEnd, getTitleIgnorePrefix } = require('../utils')
const { getTitlePrefixAtEnd, getTitleIgnorePrefix, getNormalizedTitle } = require('../utils')
const Logger = require('../Logger')
const libraryItemsPodcastFilters = require('../utils/queries/libraryItemsPodcastFilters')
const htmlSanitizer = require('../utils/htmlSanitizer')
@ -93,6 +93,7 @@ class Podcast extends Model {
{
title,
titleIgnorePrefix: getTitleIgnorePrefix(title),
titleNormalized: getNormalizedTitle(title),
author: typeof payload.metadata.author === 'string' ? payload.metadata.author : null,
releaseDate: typeof payload.metadata.releaseDate === 'string' ? payload.metadata.releaseDate : null,
feedURL: typeof payload.metadata.feedUrl === 'string' ? payload.metadata.feedUrl : null,
@ -130,6 +131,7 @@ class Podcast extends Model {
},
title: DataTypes.STRING,
titleIgnorePrefix: DataTypes.STRING,
titleNormalized: DataTypes.STRING,
author: DataTypes.STRING,
releaseDate: DataTypes.STRING,
feedURL: DataTypes.STRING,
@ -257,6 +259,7 @@ class Podcast extends Model {
if (key === 'title') {
this.titleIgnorePrefix = getTitleIgnorePrefix(this.title)
this.titleNormalized = getNormalizedTitle(this.title)
}
hasUpdates = true

View file

@ -191,6 +191,18 @@ module.exports.getTitleIgnorePrefix = (title) => {
return getTitleParts(title)[0]
}
/**
 * Get normalized title to use for grouping duplicates.
 * Takes the first title part returned by getTitleParts (falling back to the full
 * title), composes it to Unicode NFC, lowercases it and removes every character that
 * is not a Unicode letter (numbers, punctuation, spaces).
 * @param {string} title
 * @returns {string} the normalized title, or an empty string when title is empty
 */
module.exports.getNormalizedTitle = (title) => {
  if (!title) return ''
  const sortTitle = getTitleParts(title)[0] || title
  // Compose to NFC before filtering: combining marks are not matched by \p{L}, so a
  // decomposed "é" (e + U+0301) would otherwise collapse to "e" while the precomposed
  // "é" is kept, and the same title in the two forms would never group together.
  return sortTitle.normalize('NFC').toLowerCase().replace(/[^\p{L}]/gu, '')
}
/**
* Put sorting prefix at the end of title
* @example "The Good Book" => "Good Book, The"

View file

@ -515,6 +515,10 @@ module.exports = {
isInvalid: true
}
]
} else if (filterGroup === 'duplicates') {
libraryItemWhere['titleNormalized'] = {
[Sequelize.Op.in]: Sequelize.literal(`(SELECT titleNormalized FROM libraryItems WHERE libraryId = '${libraryId}' AND titleNormalized IS NOT NULL AND titleNormalized != '' GROUP BY titleNormalized HAVING COUNT(titleNormalized) > 1)`)
}
} else if (filterGroup === 'progress' && user) {
const mediaProgressWhere = {
userId: user.id

View file

@ -168,6 +168,10 @@ module.exports = {
isInvalid: true
}
]
} else if (filterGroup === 'duplicates') {
libraryItemWhere['titleNormalized'] = {
[Sequelize.Op.in]: Sequelize.literal(`(SELECT titleNormalized FROM libraryItems WHERE libraryId = '${libraryId}' AND titleNormalized IS NOT NULL AND titleNormalized != '' GROUP BY titleNormalized HAVING COUNT(titleNormalized) > 1)`)
}
} else if (filterGroup === 'recent') {
libraryItemWhere['createdAt'] = {
[Sequelize.Op.gte]: new Date(new Date() - 60 * 24 * 60 * 60 * 1000) // 60 days ago