From 4dbd92ac4d5d806f4f37275d66ad9a3bbbbb7d21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 26 Apr 2026 19:36:03 +0200 Subject: [PATCH 1/7] Use markdown as input for the LLM and add extracted microdata separatley --- composer.json | 7 + composer.lock | 466 +++++++++++++++--- .../DTOJsonSchemaConverter.php | 14 +- .../Providers/AIInfoExtractor.php | 85 ++-- symfony.lock | 6 - 5 files changed, 481 insertions(+), 97 deletions(-) diff --git a/composer.json b/composer.json index c7c52d5c..c4b3cb59 100644 --- a/composer.json +++ b/composer.json @@ -33,6 +33,7 @@ "jbtronics/dompdf-font-loader-bundle": "^1.0.0", "jbtronics/settings-bundle": "^3.0.0", "jfcherng/php-diff": "^6.14", + "jkphl/micrometa": "dev-master", "knpuniversity/oauth2-client-bundle": "^2.15", "league/commonmark": "^2.7", "league/csv": "^9.8.0", @@ -159,6 +160,12 @@ "App\\Tests\\": "tests/" } }, + "repositories": [ + { + "type": "vcs", + "url": "https://github.com/jbtronics/micrometa" + } + ], "scripts": { "auto-scripts": { "cache:clear": "symfony-cmd", diff --git a/composer.lock b/composer.lock index 00b5e831..1963342f 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "699f421ad81f8a1acacf8e2c4af66491", + "content-hash": "7c76e3af5fd042105a3208fdcb300a11", "packages": [ { "name": "amphp/amp", @@ -3883,16 +3883,16 @@ }, { "name": "doctrine/migrations", - "version": "3.9.6", + "version": "3.9.7", "source": { "type": "git", "url": "https://github.com/doctrine/migrations.git", - "reference": "ffd8355cdd8505fc650d9604f058bf62aedd80a1" + "reference": "96cb2a89b56c9efb0bac38e606dc0b0f13e650ec" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/doctrine/migrations/zipball/ffd8355cdd8505fc650d9604f058bf62aedd80a1", - "reference": "ffd8355cdd8505fc650d9604f058bf62aedd80a1", + "url": "https://api.github.com/repos/doctrine/migrations/zipball/96cb2a89b56c9efb0bac38e606dc0b0f13e650ec", + "reference": "96cb2a89b56c9efb0bac38e606dc0b0f13e650ec", "shasum": "" }, "require": { @@ -3966,7 +3966,7 @@ ], "support": { "issues": "https://github.com/doctrine/migrations/issues", - "source": "https://github.com/doctrine/migrations/tree/3.9.6" + "source": "https://github.com/doctrine/migrations/tree/3.9.7" }, "funding": [ { @@ -3982,7 +3982,7 @@ "type": "tidelift" } ], - "time": "2026-02-11T06:46:11+00:00" + "time": "2026-04-23T19:33:20+00:00" }, { "name": "doctrine/orm", @@ -4074,19 +4074,20 @@ }, { "name": "doctrine/persistence", - "version": "4.1.1", + "version": "4.2.0", "source": { "type": "git", "url": "https://github.com/doctrine/persistence.git", - "reference": "b9c49ad3558bb77ef973f4e173f2e9c2eca9be09" + "reference": "49ab73e0d3e2ac8d1f5ecda3dd8acd5503781e8b" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/doctrine/persistence/zipball/b9c49ad3558bb77ef973f4e173f2e9c2eca9be09", - "reference": "b9c49ad3558bb77ef973f4e173f2e9c2eca9be09", + "url": "https://api.github.com/repos/doctrine/persistence/zipball/49ab73e0d3e2ac8d1f5ecda3dd8acd5503781e8b", + "reference": "49ab73e0d3e2ac8d1f5ecda3dd8acd5503781e8b", "shasum": "" }, "require": { + "doctrine/deprecations": "^1", "doctrine/event-manager": "^1 || ^2", "php": "^8.1", "psr/cache": "^1.0 || ^2.0 || ^3.0" @@ -4097,13 +4098,13 @@ "phpstan/phpstan-phpunit": "^2", "phpstan/phpstan-strict-rules": "^2", "phpunit/phpunit": "^10.5.58 || ^12", - "symfony/cache": "^4.4 || ^5.4 || ^6.0 || ^7.0", - "symfony/finder": "^4.4 || ^5.4 || ^6.0 || ^7.0" + "symfony/cache": "^4.4 || ^5.4 || ^6.0 || ^7.0 || ^8.0", + "symfony/finder": "^4.4 || ^5.4 || ^6.0 || ^7.0 || ^8.0" }, "type": "library", "autoload": { "psr-4": { - "Doctrine\\Persistence\\": "src/Persistence" + "Doctrine\\Persistence\\": "src" } }, "notification-url": "https://packagist.org/downloads/", @@ -4147,7 +4148,7 @@ ], "support": { "issues": "https://github.com/doctrine/persistence/issues", - "source": "https://github.com/doctrine/persistence/tree/4.1.1" + "source": "https://github.com/doctrine/persistence/tree/4.2.0" }, "funding": [ { @@ -4163,7 +4164,7 @@ "type": "tidelift" } ], - "time": "2025-10-16T20:13:18+00:00" + "time": "2026-04-26T12:12:52+00:00" }, { "name": "doctrine/sql-formatter", @@ -5534,6 +5535,191 @@ ], "time": "2023-05-21T07:57:08+00:00" }, + { + "name": "jkphl/dom-factory", + "version": "v1.0.1", + "source": { + "type": "git", + "url": "https://github.com/jkphl/dom-factory.git", + "reference": "dd32b8b2cc800f065c0eff8bb621d9f80147d45e" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/jkphl/dom-factory/zipball/dd32b8b2cc800f065c0eff8bb621d9f80147d45e", + "reference": "dd32b8b2cc800f065c0eff8bb621d9f80147d45e", + "shasum": "" + }, + "require": { + "ext-dom": "*", + "ext-libxml": "*", + "ext-mbstring": "*", + "guzzlehttp/guzzle": "^6.0||^7.0", + "masterminds/html5": "^2.7", + "php": ">=7.2" + }, + "require-dev": { + "clue/graph-composer": "^1.1", + "php-coveralls/php-coveralls": "^2.2", + "phpunit/phpunit": "^8.0||^9.0", + "squizlabs/php_codesniffer": "^3.5" + }, + "type": "library", + "autoload": { + "psr-4": { + "Jkphl\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Joschi Kuphal", + "email": "joschi@kuphal.net", + "homepage": "https://jkphl.is", + "role": "Developer" + } + ], + "description": "Simple HTML5/XML DOM factory", + "homepage": "https://github.com/jkphl/dom-factory", + "support": { + "email": "joschi@kuphal.net", + "issues": "https://github.com/jkphl/dom-factory/issues", + "source": "https://github.com/jkphl/dom-factory" + }, + "time": "2021-06-28T11:49:36+00:00" + }, + { + "name": "jkphl/micrometa", + "version": "dev-master", + "source": { + "type": "git", + "url": "https://github.com/jbtronics/micrometa.git", + "reference": "720f409151c2cc20add9478b7a0a635fa1707021" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/jbtronics/micrometa/zipball/720f409151c2cc20add9478b7a0a635fa1707021", + "reference": "720f409151c2cc20add9478b7a0a635fa1707021", + "shasum": "" + }, + "require": { + "ext-dom": "*", + "jkphl/dom-factory": "^1", + "jkphl/rdfa-lite-microdata": "^0.4.4", + "league/uri": "^5.0|^6.5|^7.0", + "mf2/mf2": "^0.4", + "ml/json-ld": "^1.2", + "monolog/monolog": "^1.24 || ^2 || ^3", + "php": ">=7.1.3", + "psr/cache": "^1.0|^2|^3", + "psr/log": "^1.1|^2|^3", + "symfony/cache": "^4.0|^5.0|^6.0|^7.0|^8.0" + }, + "require-dev": { + "clue/graph-composer": "^1.1", + "mf2/tests": "@dev", + "php-coveralls/php-coveralls": "^2.1", + "phpunit/phpunit": "^7.0 || ^8.5", + "squizlabs/php_codesniffer": "^3.3" + }, + "default-branch": true, + "type": "library", + "autoload": { + "psr-4": { + "Jkphl\\": "src/" + } + }, + "scripts": { + "phpunit": [ + "vendor/bin/phpunit --configuration phpunit.xml.dist" + ], + "depgraph": [ + "vendor/bin/graph-composer --no-dev export . doc/dependencies.svg" + ], + "check-style": [ + "vendor/bin/phpcs -p --standard=PSR2 --runtime-set ignore_errors_on_exit 1 --runtime-set ignore_warnings_on_exit 1 src" + ], + "fix-style": [ + "vendor/bin/phpcbf -p --standard=PSR2 --runtime-set ignore_errors_on_exit 1 --runtime-set ignore_warnings_on_exit 1 src" + ], + "test": [ + "@phpunit" + ] + }, + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Joschi Kuphal", + "email": "joschi@tollwerk.de", + "homepage": "https://jkphl.is", + "role": "Developer" + } + ], + "description": "A meta parser for extracting micro information out of web documents, currently supporting Microformats 1+2, HTML Microdata, RDFa Lite 1.1 and JSON-LD", + "homepage": "https://jkphl.is/projects/micrometa/", + "support": { + "email": "joschi@tollwerk.de", + "source": "https://github.com/jkphl/micrometa", + "issues": "https://github.com/jkphl/micrometa/issues" + }, + "time": "2026-04-26T17:25:19+00:00" + }, + { + "name": "jkphl/rdfa-lite-microdata", + "version": "v0.4.7", + "source": { + "type": "git", + "url": "https://github.com/jkphl/rdfa-lite-microdata.git", + "reference": "ffc4940e8be55798257a03da7ed7d4506a13c3e5" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/jkphl/rdfa-lite-microdata/zipball/ffc4940e8be55798257a03da7ed7d4506a13c3e5", + "reference": "ffc4940e8be55798257a03da7ed7d4506a13c3e5", + "shasum": "" + }, + "require": { + "jkphl/dom-factory": "^1", + "php": ">=5.5" + }, + "require-dev": { + "clue/graph-composer": "dev-master", + "codeclimate/php-test-reporter": "^0.4.4", + "phpunit/phpunit": "^4.8", + "satooshi/php-coveralls": "^1.0", + "squizlabs/php_codesniffer": "^2.8" + }, + "type": "library", + "autoload": { + "psr-4": { + "Jkphl\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Joschi Kuphal", + "email": "joschi@tollwerk.de", + "homepage": "https://jkphl.is", + "role": "Developer" + } + ], + "description": "RDFa Lite 1.1 and HTML Microdata parser for web documents (HTML, SVG, XML)", + "homepage": "https://github.com/jkphl/rdfa-lite-microdata", + "support": { + "email": "joschi@tollwerk.de", + "issues": "https://github.com/jkphl/rdfa-lite-microdata/issues", + "source": "https://github.com/jkphl/rdfa-lite-microdata" + }, + "time": "2023-01-27T13:29:45+00:00" + }, { "name": "kelunik/certificate", "version": "v1.1.3", @@ -6899,6 +7085,170 @@ }, "time": "2025-07-25T09:04:22+00:00" }, + { + "name": "mf2/mf2", + "version": "0.4.6", + "source": { + "type": "git", + "url": "https://github.com/microformats/php-mf2.git", + "reference": "00b70ee7eb7f5b0585b1bd467f6c9cbd75055d23" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/microformats/php-mf2/zipball/00b70ee7eb7f5b0585b1bd467f6c9cbd75055d23", + "reference": "00b70ee7eb7f5b0585b1bd467f6c9cbd75055d23", + "shasum": "" + }, + "require": { + "php": ">=5.4.0" + }, + "require-dev": { + "mf2/tests": "@dev", + "phpdocumentor/phpdocumentor": "v2.8.4", + "phpunit/phpunit": "4.8.*" + }, + "suggest": { + "barnabywalters/mf-cleaner": "To more easily handle the canonical data php-mf2 gives you", + "masterminds/html5": "Alternative HTML parser for PHP, for better HTML5 support." + }, + "bin": [ + "bin/fetch-mf2", + "bin/parse-mf2" + ], + "type": "library", + "autoload": { + "files": [ + "Mf2/Parser.php" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "CC0-1.0" + ], + "authors": [ + { + "name": "Barnaby Walters", + "homepage": "http://waterpigs.co.uk" + } + ], + "description": "A pure, generic microformats2 parser — makes HTML as easy to consume as a JSON API", + "keywords": [ + "html", + "microformats", + "microformats 2", + "parser", + "semantic" + ], + "support": { + "issues": "https://github.com/microformats/php-mf2/issues", + "source": "https://github.com/microformats/php-mf2/tree/master" + }, + "time": "2018-08-24T14:47:04+00:00" + }, + { + "name": "ml/iri", + "version": "1.1.4", + "target-dir": "ML/IRI", + "source": { + "type": "git", + "url": "https://github.com/lanthaler/IRI.git", + "reference": "cbd44fa913e00ea624241b38cefaa99da8d71341" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/lanthaler/IRI/zipball/cbd44fa913e00ea624241b38cefaa99da8d71341", + "reference": "cbd44fa913e00ea624241b38cefaa99da8d71341", + "shasum": "" + }, + "require": { + "lib-pcre": ">=4.0", + "php": ">=5.3.0" + }, + "type": "library", + "autoload": { + "psr-0": { + "ML\\IRI": "" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Markus Lanthaler", + "email": "mail@markus-lanthaler.com", + "homepage": "http://www.markus-lanthaler.com", + "role": "Developer" + } + ], + "description": "IRI handling for PHP", + "homepage": "http://www.markus-lanthaler.com", + "keywords": [ + "URN", + "iri", + "uri", + "url" + ], + "support": { + "issues": "https://github.com/lanthaler/IRI/issues", + "source": "https://github.com/lanthaler/IRI/tree/master" + }, + "time": "2014-01-21T13:43:39+00:00" + }, + { + "name": "ml/json-ld", + "version": "1.2.1", + "source": { + "type": "git", + "url": "https://github.com/lanthaler/JsonLD.git", + "reference": "537e68e87a6bce23e57c575cd5dcac1f67ce25d8" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/lanthaler/JsonLD/zipball/537e68e87a6bce23e57c575cd5dcac1f67ce25d8", + "reference": "537e68e87a6bce23e57c575cd5dcac1f67ce25d8", + "shasum": "" + }, + "require": { + "ext-json": "*", + "ml/iri": "^1.1.1", + "php": ">=5.3.0" + }, + "require-dev": { + "json-ld/tests": "1.0", + "phpunit/phpunit": "^4" + }, + "type": "library", + "autoload": { + "psr-4": { + "ML\\JsonLD\\": "" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Markus Lanthaler", + "email": "mail@markus-lanthaler.com", + "homepage": "http://www.markus-lanthaler.com", + "role": "Developer" + } + ], + "description": "JSON-LD Processor for PHP", + "homepage": "http://www.markus-lanthaler.com", + "keywords": [ + "JSON-LD", + "jsonld" + ], + "support": { + "issues": "https://github.com/lanthaler/JsonLD/issues", + "source": "https://github.com/lanthaler/JsonLD/tree/1.2.1" + }, + "time": "2022-09-29T08:45:17+00:00" + }, { "name": "monolog/monolog", "version": "3.10.0", @@ -9409,16 +9759,16 @@ }, { "name": "rhukster/dom-sanitizer", - "version": "1.0.10", + "version": "1.0.11", "source": { "type": "git", "url": "https://github.com/rhukster/dom-sanitizer.git", - "reference": "49a98046b708a4c92f754f5b0ef1720bb85142e2" + "reference": "02d08ec8b36b93b04517d74fe82b715ef06273bd" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/rhukster/dom-sanitizer/zipball/49a98046b708a4c92f754f5b0ef1720bb85142e2", - "reference": "49a98046b708a4c92f754f5b0ef1720bb85142e2", + "url": "https://api.github.com/repos/rhukster/dom-sanitizer/zipball/02d08ec8b36b93b04517d74fe82b715ef06273bd", + "reference": "02d08ec8b36b93b04517d74fe82b715ef06273bd", "shasum": "" }, "require": { @@ -9448,9 +9798,9 @@ "description": "A simple but effective DOM/SVG/MathML Sanitizer for PHP 7.4+", "support": { "issues": "https://github.com/rhukster/dom-sanitizer/issues", - "source": "https://github.com/rhukster/dom-sanitizer/tree/1.0.10" + "source": "https://github.com/rhukster/dom-sanitizer/tree/1.0.11" }, - "time": "2026-04-10T17:00:11+00:00" + "time": "2026-04-23T22:56:32+00:00" }, { "name": "robrichards/xmlseclibs", @@ -13693,7 +14043,7 @@ }, { "name": "symfony/polyfill-ctype", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-ctype.git", @@ -13752,7 +14102,7 @@ "portable" ], "support": { - "source": "https://github.com/symfony/polyfill-ctype/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-ctype/tree/v1.37.0" }, "funding": [ { @@ -13776,16 +14126,16 @@ }, { "name": "symfony/polyfill-intl-grapheme", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-intl-grapheme.git", - "reference": "ad1b7b9092976d6c948b8a187cec9faaea9ec1df" + "reference": "4864388bfbd3001ce88e234fab652acd91fdc57e" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/symfony/polyfill-intl-grapheme/zipball/ad1b7b9092976d6c948b8a187cec9faaea9ec1df", - "reference": "ad1b7b9092976d6c948b8a187cec9faaea9ec1df", + "url": "https://api.github.com/repos/symfony/polyfill-intl-grapheme/zipball/4864388bfbd3001ce88e234fab652acd91fdc57e", + "reference": "4864388bfbd3001ce88e234fab652acd91fdc57e", "shasum": "" }, "require": { @@ -13834,7 +14184,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-intl-grapheme/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-intl-grapheme/tree/v1.37.0" }, "funding": [ { @@ -13854,11 +14204,11 @@ "type": "tidelift" } ], - "time": "2026-04-10T16:19:22+00:00" + "time": "2026-04-26T13:13:48+00:00" }, { "name": "symfony/polyfill-intl-icu", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-intl-icu.git", @@ -13922,7 +14272,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-intl-icu/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-intl-icu/tree/v1.37.0" }, "funding": [ { @@ -13946,7 +14296,7 @@ }, { "name": "symfony/polyfill-intl-idn", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-intl-idn.git", @@ -14009,7 +14359,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-intl-idn/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-intl-idn/tree/v1.37.0" }, "funding": [ { @@ -14033,7 +14383,7 @@ }, { "name": "symfony/polyfill-intl-normalizer", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-intl-normalizer.git", @@ -14094,7 +14444,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-intl-normalizer/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-intl-normalizer/tree/v1.37.0" }, "funding": [ { @@ -14118,7 +14468,7 @@ }, { "name": "symfony/polyfill-php83", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-php83.git", @@ -14174,7 +14524,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-php83/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-php83/tree/v1.37.0" }, "funding": [ { @@ -14198,7 +14548,7 @@ }, { "name": "symfony/polyfill-php84", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-php84.git", @@ -14254,7 +14604,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-php84/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-php84/tree/v1.37.0" }, "funding": [ { @@ -14278,16 +14628,16 @@ }, { "name": "symfony/polyfill-php85", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-php85.git", - "reference": "2c408a6bb0313e6001a83628dc5506100474254e" + "reference": "fcfa4973a9917cef23f2e38774da74a2b7d115ee" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/symfony/polyfill-php85/zipball/2c408a6bb0313e6001a83628dc5506100474254e", - "reference": "2c408a6bb0313e6001a83628dc5506100474254e", + "url": "https://api.github.com/repos/symfony/polyfill-php85/zipball/fcfa4973a9917cef23f2e38774da74a2b7d115ee", + "reference": "fcfa4973a9917cef23f2e38774da74a2b7d115ee", "shasum": "" }, "require": { @@ -14334,7 +14684,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-php85/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-php85/tree/v1.37.0" }, "funding": [ { @@ -14354,11 +14704,11 @@ "type": "tidelift" } ], - "time": "2026-04-10T16:50:15+00:00" + "time": "2026-04-26T13:10:57+00:00" }, { "name": "symfony/polyfill-uuid", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-uuid.git", @@ -14417,7 +14767,7 @@ "uuid" ], "support": { - "source": "https://github.com/symfony/polyfill-uuid/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-uuid/tree/v1.37.0" }, "funding": [ { @@ -19854,12 +20204,12 @@ "source": { "type": "git", "url": "https://github.com/Roave/SecurityAdvisories.git", - "reference": "10b8a93511210c9bae3be31f4fe13c3ff974cad4" + "reference": "08cd07f04fb07fb4d316e956801d57b700cf7096" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/Roave/SecurityAdvisories/zipball/10b8a93511210c9bae3be31f4fe13c3ff974cad4", - "reference": "10b8a93511210c9bae3be31f4fe13c3ff974cad4", + "url": "https://api.github.com/repos/Roave/SecurityAdvisories/zipball/08cd07f04fb07fb4d316e956801d57b700cf7096", + "reference": "08cd07f04fb07fb4d316e956801d57b700cf7096", "shasum": "" }, "conflict": { @@ -19882,6 +20232,7 @@ "alextselegidis/easyappointments": "<=1.5.2", "alexusmai/laravel-file-manager": "<=3.3.1", "algolia/algoliasearch-magento-2": "<=3.16.1|>=3.17.0.0-beta1,<=3.17.1", + "almirhodzic/nova-toggle-5": "<1.3", "alt-design/alt-redirect": "<1.6.4", "altcha-org/altcha": "<1.3.1", "alterphp/easyadmin-extension-bundle": ">=1.2,<1.2.11|>=1.3,<1.3.1", @@ -19978,7 +20329,7 @@ "ckeditor/ckeditor": "<4.25", "clickstorm/cs-seo": ">=6,<6.8|>=7,<7.5|>=8,<8.4|>=9,<9.3", "co-stack/fal_sftp": "<0.2.6", - "cockpit-hq/cockpit": "<2.13.5", + "cockpit-hq/cockpit": "<2.14", "code16/sharp": "<9.20", "codeception/codeception": "<3.1.3|>=4,<4.1.22", "codeigniter/framework": "<3.1.10", @@ -20141,7 +20492,7 @@ "fisharebest/webtrees": "<=2.1.18", "fixpunkt/fp-masterquiz": "<2.2.1|>=3,<3.5.2", "fixpunkt/fp-newsletter": "<1.1.1|>=1.2,<2.1.2|>=2.2,<3.2.6", - "flarum/core": "<1.8.10", + "flarum/core": "<=1.8.15|>=2.0.0.0-beta1,<=2.0.0.0-beta8", "flarum/flarum": "<0.1.0.0-beta8", "flarum/framework": "<1.8.10", "flarum/mentions": "<1.6.3", @@ -20178,7 +20529,7 @@ "geshi/geshi": "<=1.0.9.1", "getformwork/formwork": "<=2.3.3", "getgrav/grav": "<1.11.0.0-beta1", - "getkirby/cms": "<=5.2.1", + "getkirby/cms": "<5.4", "getkirby/kirby": "<3.9.8.3-dev|>=3.10,<3.10.1.2-dev|>=4,<4.7.1", "getkirby/panel": "<2.5.14", "getkirby/starterkit": "<=3.7.0.2", @@ -20276,7 +20627,7 @@ "kelvinmo/simplexrd": "<3.1.1", "kevinpapst/kimai2": "<1.16.7", "khodakhah/nodcms": "<=3.4.1", - "kimai/kimai": "<=2.53", + "kimai/kimai": "<2.54", "kitodo/presentation": "<3.2.3|>=3.3,<3.3.4", "klaviyo/magento2-extension": ">=1,<3", "knplabs/knp-snappy": "<=1.4.2", @@ -20720,7 +21071,7 @@ "twig/twig": "<3.11.2|>=3.12,<3.14.1|>=3.16,<3.19", "typicms/core": "<16.1.7", "typo3/cms": "<9.5.29|>=10,<10.4.35|>=11,<11.5.23|>=12,<12.2", - "typo3/cms-backend": "<4.1.14|>=4.2,<4.2.15|>=4.3,<4.3.7|>=4.4,<4.4.4|>=7,<=7.6.50|>=8,<=8.7.39|>=9,<9.5.55|>=10,<=10.4.54|>=11,<=11.5.48|>=12,<=12.4.40|>=13,<=13.4.22|>=14,<=14.0.1", + "typo3/cms-backend": "<4.1.14|>=4.2,<4.2.15|>=4.3,<4.3.7|>=4.4,<4.4.4|>=7,<=7.6.50|>=8,<=8.7.39|>=9,<9.5.55|>=10,<=10.4.54|>=11,<=11.5.48|>=12,<=12.4.40|>=13,<=13.4.22|>=14,<=14.0.1|==14.2", "typo3/cms-belog": ">=10,<=10.4.47|>=11,<=11.5.41|>=12,<=12.4.24|>=13,<=13.4.2", "typo3/cms-beuser": ">=9,<9.5.55|>=10,<10.4.54|>=11,<11.5.48|>=12,<12.4.37|>=13,<13.4.18", "typo3/cms-core": "<=8.7.56|>=9,<9.5.55|>=10,<=10.4.54|>=11,<=11.5.48|>=12,<=12.4.40|>=13,<=13.4.22|>=14,<=14.0.1", @@ -20902,7 +21253,7 @@ "type": "tidelift" } ], - "time": "2026-04-22T18:27:19+00:00" + "time": "2026-04-24T17:22:29+00:00" }, { "name": "sebastian/cli-parser", @@ -22418,6 +22769,7 @@ "aliases": [], "minimum-stability": "stable", "stability-flags": { + "jkphl/micrometa": 20, "roave/security-advisories": 20 }, "prefer-stable": false, diff --git a/src/Services/InfoProviderSystem/DTOJsonSchemaConverter.php b/src/Services/InfoProviderSystem/DTOJsonSchemaConverter.php index b9208c87..2d297243 100644 --- a/src/Services/InfoProviderSystem/DTOJsonSchemaConverter.php +++ b/src/Services/InfoProviderSystem/DTOJsonSchemaConverter.php @@ -54,7 +54,8 @@ final class DTOJsonSchemaConverter 'category' => ['type' => ['string', 'null'], 'description' => 'Product category'], 'manufacturing_status' => ['type' => ['string', 'null'], 'enum' => ['active', 'obsolete', 'nrfnd', 'discontinued', null], 'description' => 'Manufacturing status'], 'footprint' => ['type' => ['string', 'null'], 'description' => 'Package/footprint type'], - 'mass' => ['type' => ['number', 'null'], 'description' => 'Mass in grams'], + 'mass' => ['type' => ['number', 'null'], 'description' => 'Mass of the product in grams'], + 'gtin' => ['type' => ['string', 'null'], 'description' => 'Global Trade Item Number (GTIN) / EAN / UPC code'], 'parameters' => [ 'type' => 'array', 'items' => [ @@ -94,17 +95,17 @@ final class DTOJsonSchemaConverter 'items' => [ 'type' => 'object', 'properties' => [ - 'distributor_name' => ['type' => 'string'], - 'order_number' => ['type' => ['string', 'null']], + 'distributor_name' => ['type' => 'string', 'description' => 'Name of the distributor or vendor. Typically the shop name'], + 'order_number' => ['type' => ['string', 'null'], 'description' => 'The order number or SKU used by the distributor. Optional, but can help to find the product on the distributor website.'], 'product_url' => ['type' => 'string'], 'prices' => [ 'type' => 'array', 'items' => [ 'type' => 'object', 'properties' => [ - 'minimum_quantity' => ['type' => 'integer'], - 'price' => ['type' => 'number'], - 'currency' => ['type' => 'string'], + 'minimum_quantity' => ['type' => 'integer', 'description' => 'Minimum quantity for this price tier. 1 when no tiered pricing is available.'], + 'price' => ['type' => 'number', 'description' => 'Price for the given minimum quantity.'], + 'currency' => ['type' => 'string', 'description' => 'Currency ISO code, e.g. USD'], ], 'required' => ['minimum_quantity', 'price', 'currency'], ], @@ -226,6 +227,7 @@ final class DTOJsonSchemaConverter manufacturing_status: $manufacturingStatus, provider_url: $productUrl, footprint: $data['footprint'] ?? null, + gtin: $data['gtin'] ?? null, notes: null, datasheets: $datasheets, images: $images, diff --git a/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php b/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php index c8eff0a4..7ae858a6 100644 --- a/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php +++ b/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php @@ -29,10 +29,15 @@ use App\Services\AI\AIPlatformRegistry; use App\Services\InfoProviderSystem\DTOJsonSchemaConverter; use App\Services\InfoProviderSystem\DTOs\PartDetailDTO; use App\Settings\InfoProviderSystem\AIExtractorSettings; +use Brick\Schema\SchemaReader; +use Jkphl\Micrometa; +use League\HTMLToMarkdown\HtmlConverter; use Symfony\AI\Platform\Message\Message; use Symfony\AI\Platform\Message\MessageBag; +use Symfony\Component\DomCrawler\Crawler; use Symfony\Contracts\HttpClient\HttpClientInterface; + final class AIInfoExtractor implements InfoProviderInterface { use FixAndValidateUrlTrait; @@ -95,16 +100,56 @@ final class AIInfoExtractor implements InfoProviderInterface $html = $response->getContent(); // Clean HTML - $cleanedHtml = $this->cleanHTML($html); + /*$cleanedHtml = $this->cleanHTML($html); // Truncate to max content length - $truncatedHtml = $this->truncateHTML($cleanedHtml, $this->settings->maxContentLength); + $truncatedHtml = $this->truncateHTML($cleanedHtml, $this->settings->maxContentLength);*/ + + $markdown = $this->htmlToMarkdown($html); + + //Extract structured data using traditional methods, to provide additional context to the LLM. This can help improve accuracy, especially for technical specifications that might be in tables or specific formats. + $structuredData = $this->extractStructuredData($html, $url); // Call LLM - $llmResponse = $this->callLLM($truncatedHtml, $url); + $llmResponse = $this->callLLM($markdown, $url, $structuredData); // Build and return PartDetailDTO - return $this->jsonSchemaConverter->jsonToDTO($llmResponse, $this->getProviderKey(), $url, $url, self::DISTRIBUTOR_NAME); + $result = $this->jsonSchemaConverter->jsonToDTO($llmResponse, $this->getProviderKey(), $url, $url, self::DISTRIBUTOR_NAME); + + return $result; + } + + /** + * Extracts structured data from the HTML using microformats. + * @param string $html + * @param string $url + * @return string JSON encoded structured data + */ + private function extractStructuredData(string $html, string $url): string + { + $micrometa = new Micrometa\Ports\Parser(); + $items = $micrometa($url, $html); + + return json_encode($items->toObject(), JSON_THROW_ON_ERROR); + } + + private function htmlToMarkdown(string $html): string + { + //Extract only the main content of the page to avoid overwhelming the LLM with irrelevant information. + $crawler = new Crawler($html); + $mainContent = $crawler->filter('main, article, #content')->first(); + + // If we found a specific content area, get its HTML; otherwise, use the whole body. + $htmlToConvert = $mainContent->count() ? $mainContent->html() : $html; + + //Concert to markdown + $converter = new HtmlConverter([ + 'strip_tags' => true, // Removes tags that aren't Markdown-compatible (like
) + 'hard_break' => true, // Preserves line breaks + 'remove_nodes' => 'nav footer script style' // Extra safety layer + ]); + + return $converter->convert($htmlToConvert); } public function getCapabilities(): array @@ -160,13 +205,18 @@ final class AIInfoExtractor implements InfoProviderInterface return $truncated; } - private function callLLM(string $htmlContent, string $url): array + private function callLLM(string $htmlContent, string $url, ?string $structuredData = null): array { $input = new MessageBag( Message::forSystem($this->buildSystemPrompt()), Message::ofUser("Extract part information from this webpage content:\n\nURL: $url\n\n$htmlContent") ); + if ($structuredData) { + $input->add(Message::ofUser("Following data was extracted using traditional methods, but might be incomplete or inaccurate. + Enrich it with the actual website data:\n\n".$structuredData)); + } + try { $aiPlatform = $this->AIPlatformRegistry->getPlatform($this->settings->platform ?? throw new \RuntimeException('No AI platform selected') ); @@ -187,29 +237,8 @@ final class AIInfoExtractor implements InfoProviderInterface private function buildSystemPrompt(): string { return <<<'PROMPT' -You are an expert at extracting electronic component information from web pages. Extract structured data in JSON format. - -Return ONLY a valid JSON object with this exact structure: -{ - "name": "string", - "description": "string", - "manufacturer": "string | null", - "mpn": "string | null", - "category": "string | null", - "manufacturing_status": "active|obsolete|nrfnd|discontinued|null", - "footprint": "string | null", - "mass": "number | null (in grams)", - "parameters": [{"name": "string", "value": "string", "unit": "string | null"}], - "datasheets": [{"url": "string", "description": "string"}], - "images": [{"url": "string", "description": "string"}], - "vendor_infos": [{ - "distributor_name": "string", - "order_number": "string | null", - "product_url": "string", - "prices": [{"minimum_quantity": int, "price": number, "currency": "string"}] - }], - "manufacturer_product_url": "string | null" -} +You are an expert at extracting electronic component information from web pages. Extract structured data in JSON format, from markdown extracted from a product page. +Focus on the main content of the page, such as product descriptions, specifications, and tables. Ignore navigation menus, footers, and sidebars. Rules: - manufacturing_status: Use "active", "obsolete", "nrfnd" (not recommended for new designs), "discontinued", or null diff --git a/symfony.lock b/symfony.lock index dde7df36..f8f88675 100644 --- a/symfony.lock +++ b/symfony.lock @@ -441,12 +441,6 @@ "symfony/browser-kit": { "version": "v4.2.3" }, - "symfony/cache": { - "version": "v4.2.3" - }, - "symfony/cache-contracts": { - "version": "v1.1.5" - }, "symfony/config": { "version": "v4.2.3" }, From 4a45b5d5a9c439a81c0c99d3bedc2dbf1825a002 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 26 Apr 2026 21:31:07 +0200 Subject: [PATCH 2/7] Improved markdown conversion and add ability to extract notes --- .../DTOJsonSchemaConverter.php | 17 ++--- .../Providers/AIInfoExtractor.php | 62 ++++++------------- 2 files changed, 28 insertions(+), 51 deletions(-) diff --git a/src/Services/InfoProviderSystem/DTOJsonSchemaConverter.php b/src/Services/InfoProviderSystem/DTOJsonSchemaConverter.php index 2d297243..759c3d12 100644 --- a/src/Services/InfoProviderSystem/DTOJsonSchemaConverter.php +++ b/src/Services/InfoProviderSystem/DTOJsonSchemaConverter.php @@ -48,14 +48,15 @@ final class DTOJsonSchemaConverter 'type' => 'object', 'properties' => [ 'name' => ['type' => 'string', 'description' => 'Product name'], - 'description' => ['type' => 'string', 'description' => 'Product description'], + 'description' => ['type' => 'string', 'description' => 'A short description of the product, maybe containing the most important things. Onnly One line.'], 'manufacturer' => ['type' => ['string', 'null'], 'description' => 'Manufacturer name'], 'mpn' => ['type' => ['string', 'null'], 'description' => 'Manufacturer Part Number'], - 'category' => ['type' => ['string', 'null'], 'description' => 'Product category'], + 'category' => ['type' => ['string', 'null'], 'description' => 'Product category, e.g. "Passive components -> Resistors"'], 'manufacturing_status' => ['type' => ['string', 'null'], 'enum' => ['active', 'obsolete', 'nrfnd', 'discontinued', null], 'description' => 'Manufacturing status'], - 'footprint' => ['type' => ['string', 'null'], 'description' => 'Package/footprint type'], + 'footprint' => ['type' => ['string', 'null'], 'description' => 'Package/footprint type, like "SOT-23", "DIP-8", "QFN-32" etc.'], 'mass' => ['type' => ['number', 'null'], 'description' => 'Mass of the product in grams'], - 'gtin' => ['type' => ['string', 'null'], 'description' => 'Global Trade Item Number (GTIN) / EAN / UPC code'], + 'gtin' => ['type' => ['string', 'null'], 'description' => 'Global Trade Item Number (GTIN) / EAN / UPC code for barcodes'], + 'notes' => ['type' => ['string', 'null'], 'description' => 'Optional long description of the part with more details than description. Can be markdown formatted.'], 'parameters' => [ 'type' => 'array', 'items' => [ @@ -98,6 +99,7 @@ final class DTOJsonSchemaConverter 'distributor_name' => ['type' => 'string', 'description' => 'Name of the distributor or vendor. Typically the shop name'], 'order_number' => ['type' => ['string', 'null'], 'description' => 'The order number or SKU used by the distributor. Optional, but can help to find the product on the distributor website.'], 'product_url' => ['type' => 'string'], + 'prices_include_vat' => ['type' => ['boolean', 'null'], 'description' => 'Whether the prices include VAT or not. Null if unknown.'], 'prices' => [ 'type' => 'array', 'items' => [ @@ -194,8 +196,8 @@ final class DTOJsonSchemaConverter $prices[] = new PriceDTO( minimum_discount_amount: (int) ($p['minimum_quantity'] ?? 1), price: (string) ($p['price'] ?? 0), - currency_iso_code: $p['currency'] ?? 'USD', - price_related_quantity: (int) ($p['minimum_quantity'] ?? 1), + currency_iso_code: $p['currency'] ?? null, + price_related_quantity: 1, ); } } @@ -205,6 +207,7 @@ final class DTOJsonSchemaConverter order_number: $v['order_number'] ?? 'Unknown', prices: $prices, product_url: $v['product_url'] ?? $productUrl, + prices_include_vat: $v['prices_include_vat'] ?? null, ); } } @@ -228,7 +231,7 @@ final class DTOJsonSchemaConverter provider_url: $productUrl, footprint: $data['footprint'] ?? null, gtin: $data['gtin'] ?? null, - notes: null, + notes: $data['notes'], datasheets: $datasheets, images: $images, parameters: $parameters, diff --git a/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php b/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php index 7ae858a6..92ed4e19 100644 --- a/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php +++ b/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php @@ -37,6 +37,8 @@ use Symfony\AI\Platform\Message\MessageBag; use Symfony\Component\DomCrawler\Crawler; use Symfony\Contracts\HttpClient\HttpClientInterface; +use function Symfony\Component\String\u; + final class AIInfoExtractor implements InfoProviderInterface { @@ -105,7 +107,10 @@ final class AIInfoExtractor implements InfoProviderInterface // Truncate to max content length $truncatedHtml = $this->truncateHTML($cleanedHtml, $this->settings->maxContentLength);*/ + //Convert html to markdown, to provide a cleaner input to the LLM. $markdown = $this->htmlToMarkdown($html); + //Truncate markdown to max content length, if needed + $markdown = u($markdown)->truncate($this->settings->maxContentLength, '... [truncated]')->toString(); //Extract structured data using traditional methods, to provide additional context to the LLM. This can help improve accuracy, especially for technical specifications that might be in tables or specific formats. $structuredData = $this->extractStructuredData($html, $url); @@ -137,10 +142,21 @@ final class AIInfoExtractor implements InfoProviderInterface { //Extract only the main content of the page to avoid overwhelming the LLM with irrelevant information. $crawler = new Crawler($html); - $mainContent = $crawler->filter('main, article, #content')->first(); + $mainContent = $crawler->filter('main, article, #content'); // If we found a specific content area, get its HTML; otherwise, use the whole body. - $htmlToConvert = $mainContent->count() ? $mainContent->html() : $html; + //Concat the html of all matched nodes, to provide more context to the LLM, especially for pages that use multiple sections for product info. + if ($mainContent->count() > 0) { + $htmlToConvert = ''; + foreach ($mainContent as $node) { + $htmlToConvert .= $node->ownerDocument->saveHTML($node); + $htmlToConvert .= "\n\n"; // Add some spacing between sections + } + } else { + //Use the whole body content, as it might contain relevant information, especially for simpler pages that don't have a clear main/content section. + $htmlToConvert = $html; + } + //Concert to markdown $converter = new HtmlConverter([ @@ -163,48 +179,6 @@ final class AIInfoExtractor implements InfoProviderInterface ]; } - private function cleanHTML(string $html): string - { - // Remove script tags - $html = preg_replace('/]*>(.*?)<\/script>/is', '', $html); - - // Remove style tags - $html = preg_replace('/]*>(.*?)<\/style>/is', '', $html); - - // Remove nav tags - $html = preg_replace('/]*>(.*?)<\/nav>/is', '', $html); - - // Remove footer tags - $html = preg_replace('/]*>(.*?)<\/footer>/is', '', $html); - - // Remove header tags - $html = preg_replace('/]*>(.*?)<\/header>/is', '', $html); - - // Remove HTML comments - $html = preg_replace('//is', '', $html); - - return $html; - } - - private function truncateHTML(string $html, int $maxLength): string - { - if (strlen($html) <= $maxLength) { - return $html; - } - - // Truncate and find the last > or space to avoid cutting tags - $truncated = substr($html, 0, $maxLength); - - // Find the last occurrence of > or space - $lastPos = max(strrpos($truncated, '>'), strrpos($truncated, ' ')); - - if ($lastPos !== false && $lastPos > $maxLength * 0.9) { - $truncated = substr($truncated, 0, $lastPos + 1); - } - - return $truncated; - } - private function callLLM(string $htmlContent, string $url, ?string $structuredData = null): array { $input = new MessageBag( From 7117926584186c462056f4cad8dc4945ed770c79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 26 Apr 2026 21:32:19 +0200 Subject: [PATCH 3/7] Fixed error when notes are not defined --- src/Services/InfoProviderSystem/DTOJsonSchemaConverter.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Services/InfoProviderSystem/DTOJsonSchemaConverter.php b/src/Services/InfoProviderSystem/DTOJsonSchemaConverter.php index 759c3d12..bfc41121 100644 --- a/src/Services/InfoProviderSystem/DTOJsonSchemaConverter.php +++ b/src/Services/InfoProviderSystem/DTOJsonSchemaConverter.php @@ -231,7 +231,7 @@ final class DTOJsonSchemaConverter provider_url: $productUrl, footprint: $data['footprint'] ?? null, gtin: $data['gtin'] ?? null, - notes: $data['notes'], + notes: $data['notes'] ?? null, datasheets: $datasheets, images: $images, parameters: $parameters, From 0ca5a41298796ba6a8ec56c15f87747f218472a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 26 Apr 2026 22:11:27 +0200 Subject: [PATCH 4/7] Added option for translating AI extracted output --- .../Providers/AIInfoExtractor.php | 11 ++++++++- .../AIExtractorSettings.php | 8 +++++++ translations/messages.en.xlf | 24 +++++++++++++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php b/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php index 92ed4e19..2032f634 100644 --- a/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php +++ b/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php @@ -35,6 +35,7 @@ use League\HTMLToMarkdown\HtmlConverter; use Symfony\AI\Platform\Message\Message; use Symfony\AI\Platform\Message\MessageBag; use Symfony\Component\DomCrawler\Crawler; +use Symfony\Component\Intl\Languages; use Symfony\Contracts\HttpClient\HttpClientInterface; use function Symfony\Component\String\u; @@ -210,7 +211,7 @@ final class AIInfoExtractor implements InfoProviderInterface private function buildSystemPrompt(): string { - return <<<'PROMPT' + $tmp = <<<'PROMPT' You are an expert at extracting electronic component information from web pages. Extract structured data in JSON format, from markdown extracted from a product page. Focus on the main content of the page, such as product descriptions, specifications, and tables. Ignore navigation menus, footers, and sidebars. @@ -224,6 +225,14 @@ Rules: For parameters, combine name, value, and unit. The unit should be separate if possible. PROMPT; + + if ($this->settings->outputLanguage === null) { + $tmp .= "\n\nProvide the response in the same language of the webpage."; + } else { + $tmp .= "\n\nThe response must be in ". Languages::getName($this->settings->outputLanguage, 'en') ." language. Translate texts if needed."; + } + + return $tmp; } } diff --git a/src/Settings/InfoProviderSystem/AIExtractorSettings.php b/src/Settings/InfoProviderSystem/AIExtractorSettings.php index 876c687f..4ef9a1fa 100644 --- a/src/Settings/InfoProviderSystem/AIExtractorSettings.php +++ b/src/Settings/InfoProviderSystem/AIExtractorSettings.php @@ -32,7 +32,9 @@ use Jbtronics\SettingsBundle\Settings\Settings; use Jbtronics\SettingsBundle\Settings\SettingsParameter; use Jbtronics\SettingsBundle\Settings\SettingsTrait; use Symfony\AI\Platform\Capability; +use Symfony\Component\Form\Extension\Core\Type\LanguageType; use Symfony\Component\Translation\TranslatableMessage as TM; +use Symfony\Component\Validator\Constraints\Language; #[Settings(name: "ai_extractor", label: new TM("settings.ips.ai_extractor"), description: new TM("settings.ips.ai_extractor.description"))] #[SettingsIcon("fa-plug")] @@ -56,4 +58,10 @@ class AIExtractorSettings description: new TM("settings.ips.ai_extractor.max_content_length.description"), )] public int $maxContentLength = 50000; + + #[Language] + #[SettingsParameter(label: new TM("settings.ips.ai_extractor.output_language"), description: new TM("settings.ips.ai_extractor.output_language.description"), + formType: LanguageType::class, + )] + public ?string $outputLanguage = null; } diff --git a/translations/messages.en.xlf b/translations/messages.en.xlf index e16a6d69..c0bf8b60 100644 --- a/translations/messages.en.xlf +++ b/translations/messages.en.xlf @@ -13103,5 +13103,29 @@ Buerklin-API Authentication server: The AI model that should be used for extraction. Must support structured output. + + + settings.ips.ai_extractor.max_content_length + Max. Website Content length + + + + + settings.ips.ai_extractor.max_content_length.description + The maximum number of characters of the website that are sent to the AI service. + + + + + settings.ips.ai_extractor.output_language + Output language + + + + + settings.ips.ai_extractor.output_language.description + By default, the providers returns information in the same language as the website. With that option you can ask the AI to translate it for you. Might only work with certain models. + + From ad096aa6ff89bdae1cd090e0cbbd0992d05152f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 26 Apr 2026 23:15:29 +0200 Subject: [PATCH 5/7] Improved parameter extraction & extraction of other infos --- .../DTOJsonSchemaConverter.php | 26 ++++++++++++------- .../Providers/AIInfoExtractor.php | 7 +++-- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/src/Services/InfoProviderSystem/DTOJsonSchemaConverter.php b/src/Services/InfoProviderSystem/DTOJsonSchemaConverter.php index bfc41121..a61e7465 100644 --- a/src/Services/InfoProviderSystem/DTOJsonSchemaConverter.php +++ b/src/Services/InfoProviderSystem/DTOJsonSchemaConverter.php @@ -63,13 +63,19 @@ final class DTOJsonSchemaConverter 'type' => 'object', 'properties' => [ 'name' => ['type' => 'string'], - 'value' => ['type' => 'string'], - 'unit' => ['type' => ['string', 'null']], + 'symbol' => ['type' => ['string', 'null'], 'description' => 'An optional quantity symbol for the parameter in latex code, like R_1'], + 'value_typical' => ['type' => ['number', 'null'], 'description' => 'The typical value of the parameter. For example, for a resistor this could be 100 for a 100 Ohm resistor. Also used if only one numeric value is given. If used an unit should be given'], + 'value_min' => ['type' => ['number', 'null'], 'description' => 'If a range is given for the parameter, this is the minimum value. Null if no range is given.'], + 'value_max' => ['type' => ['number', 'null'], 'description' => 'If a range is given for the parameter, this is the maximum value. Null if not a range.'], + 'value_text' => ['type' => ['string', 'null'], 'description' => 'When a value is not numeric it can be put here as text. Only use if it does not fit in value_min, value_typical or value_max. E.g. "Yes", "Red", etc.'], + 'group' => ['type' => ['string', 'null'], 'description' => 'An optional group name for the parameter, e.g. "Electrical parameters", "Mechanical parameters" etc.'], + 'unit' => ['type' => ['string', 'null'], 'description' => 'The unit of the parameter values, e.g. kg, Ohm, V, etc.'], ], - 'required' => ['name', 'value'], + 'required' => ['name', 'value_typical', 'value_min', 'value_max', 'value_text'] ], ], 'datasheets' => [ + 'description' => 'A list of datasheets, manuals, or other technical documents related to the product. Not images, but actual documents, preferably PDFs.', 'type' => 'array', 'items' => [ 'type' => 'object', @@ -145,13 +151,15 @@ final class DTOJsonSchemaConverter $parameters = []; foreach ($data['parameters'] as $p) { if (!empty($p['name'])) { - $value = $p['value'] ?? ''; - $unit = $p['unit'] ?? null; - // Combine value and unit for parsing - $valueWithUnit = $unit ? $value . ' ' . $unit : $value; - $parameters[] = ParameterDTO::parseValueField( + $parameters[] = new ParameterDTO( name: $p['name'], - value: $valueWithUnit + value_text: $p['value_text'] ?? null, + value_typ: isset($p['value_typical']) && is_numeric($p['value_typical']) ? (float) $p['value_typical'] : null, + value_min: isset($p['value_min']) && is_numeric($p['value_min']) ? (float) $p['value_min'] : null, + value_max: isset($p['value_max']) && is_numeric($p['value_max']) ? (float) $p['value_max'] : null, + unit: $p['unit'] ?? null, + symbol: $p['symbol'] ?? null, + group: $p['group'] ?? null, ); } } diff --git a/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php b/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php index 2032f634..379d97cc 100644 --- a/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php +++ b/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php @@ -217,13 +217,12 @@ Focus on the main content of the page, such as product descriptions, specificati Rules: - manufacturing_status: Use "active", "obsolete", "nrfnd" (not recommended for new designs), "discontinued", or null -- parameters: Extract technical specs like voltage, current, temperature, etc. +- parameters: Extract technical specs like voltage, current, temperature, etc. and put them into the fields according to the JSON schema. Include units if available. - prices: Extract pricing tiers with minimum_quantity, price, and currency code - URLs must be absolute (include https://...) - If information is not found, use null -- Return ONLY the JSON, no explanation text - -For parameters, combine name, value, and unit. The unit should be separate if possible. +- Try to avoid duplicating parameters, if the same parameter is mentioned multiple times, or if it is already used in another field. +- Include only the 1 to 3 most relevant images, such as the main product image or important diagrams. Ignore decorative images, logos, or icons. PROMPT; if ($this->settings->outputLanguage === null) { From 5edcc60d410628287dc9a73ffecb6065835fe54e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 26 Apr 2026 23:18:09 +0200 Subject: [PATCH 6/7] Randomize UserAgent and prevent access to private networks for AI extractor --- .../Providers/AIInfoExtractor.php | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php b/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php index 379d97cc..0c2f19d9 100644 --- a/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php +++ b/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php @@ -25,6 +25,7 @@ declare(strict_types=1); namespace App\Services\InfoProviderSystem\Providers; use App\Exceptions\ProviderIDNotSupportedException; +use App\Helpers\RandomizeUseragentHttpClient; use App\Services\AI\AIPlatformRegistry; use App\Services\InfoProviderSystem\DTOJsonSchemaConverter; use App\Services\InfoProviderSystem\DTOs\PartDetailDTO; @@ -35,6 +36,7 @@ use League\HTMLToMarkdown\HtmlConverter; use Symfony\AI\Platform\Message\Message; use Symfony\AI\Platform\Message\MessageBag; use Symfony\Component\DomCrawler\Crawler; +use Symfony\Component\HttpClient\NoPrivateNetworkHttpClient; use Symfony\Component\Intl\Languages; use Symfony\Contracts\HttpClient\HttpClientInterface; @@ -55,12 +57,12 @@ final class AIInfoExtractor implements InfoProviderInterface private readonly AIPlatformRegistry $AIPlatformRegistry, private readonly DTOJsonSchemaConverter $jsonSchemaConverter, ) { - $this->httpClient = $httpClient->withOptions([ - 'timeout' => 30, - 'headers' => [ - 'User-Agent' => 'Mozilla/5.0 (compatible; Part-DB AI-Extractor/1.0)', - ], - ]); + //Use NoPrivateNetworkHttpClient to prevent SSRF vulnerabilities, and RandomizeUseragentHttpClient to make it harder for servers to block us + $this->httpClient = (new RandomizeUseragentHttpClient(new NoPrivateNetworkHttpClient($httpClient)))->withOptions( + [ + 'timeout' => 15, + ] + ); } public function getProviderInfo(): array @@ -199,7 +201,7 @@ final class AIInfoExtractor implements InfoProviderInterface $result = $aiPlatform->invoke($this->settings->model ?? throw new \RuntimeException('No model selected'), $input, [ 'response_format' => [ 'type' => 'json_schema', - 'json_schema' => $this->jsonSchemaConverter->getJSONSchema(), + 'json_schema' => $this->jsonSchemaConverter->getJSONSchema(), ] ]); } catch (\Throwable $e) { From cf34de67729c28666b2c4e46a60064e543611379 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 26 Apr 2026 23:24:51 +0200 Subject: [PATCH 7/7] Allow to pass additional instructions to the AI model --- .../InfoProviderSystem/Providers/AIInfoExtractor.php | 4 ++++ .../InfoProviderSystem/AIExtractorSettings.php | 6 ++++++ .../InfoProviderSystem/InfoProviderSettings.php | 5 +++-- translations/messages.en.xlf | 12 ++++++++++++ 4 files changed, 25 insertions(+), 2 deletions(-) diff --git a/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php b/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php index 0c2f19d9..ef7f05a5 100644 --- a/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php +++ b/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php @@ -233,6 +233,10 @@ PROMPT; $tmp .= "\n\nThe response must be in ". Languages::getName($this->settings->outputLanguage, 'en') ." language. Translate texts if needed."; } + if ($this->settings->additionalInstructions) { + $tmp .= "\n\nAdditional instructions:\n" . $this->settings->additionalInstructions; + } + return $tmp; } diff --git a/src/Settings/InfoProviderSystem/AIExtractorSettings.php b/src/Settings/InfoProviderSystem/AIExtractorSettings.php index 4ef9a1fa..da5ef0f9 100644 --- a/src/Settings/InfoProviderSystem/AIExtractorSettings.php +++ b/src/Settings/InfoProviderSystem/AIExtractorSettings.php @@ -33,6 +33,7 @@ use Jbtronics\SettingsBundle\Settings\SettingsParameter; use Jbtronics\SettingsBundle\Settings\SettingsTrait; use Symfony\AI\Platform\Capability; use Symfony\Component\Form\Extension\Core\Type\LanguageType; +use Symfony\Component\Form\Extension\Core\Type\TextareaType; use Symfony\Component\Translation\TranslatableMessage as TM; use Symfony\Component\Validator\Constraints\Language; @@ -64,4 +65,9 @@ class AIExtractorSettings formType: LanguageType::class, )] public ?string $outputLanguage = null; + + #[SettingsParameter(label: new TM("settings.ips.ai_extractor.additional_instructions"), description: new TM("settings.ips.ai_extractor.additional_instructions.description"), + formType: TextareaType::class, + )] + public ?string $additionalInstructions = null; } diff --git a/src/Settings/InfoProviderSystem/InfoProviderSettings.php b/src/Settings/InfoProviderSystem/InfoProviderSettings.php index be0fe746..3e2a27ef 100644 --- a/src/Settings/InfoProviderSystem/InfoProviderSettings.php +++ b/src/Settings/InfoProviderSystem/InfoProviderSettings.php @@ -40,6 +40,9 @@ class InfoProviderSettings #[EmbeddedSettings] public ?GenericWebProviderSettings $genericWebProvider = null; + #[EmbeddedSettings] + public ?AIExtractorSettings $aiExtractor = null; + #[EmbeddedSettings] public ?DigikeySettings $digikey = null; @@ -76,6 +79,4 @@ class InfoProviderSettings #[EmbeddedSettings] public ?CanopySettings $canopy = null; - #[EmbeddedSettings] - public ?AIExtractorSettings $aiExtractor = null; } diff --git a/translations/messages.en.xlf b/translations/messages.en.xlf index c0bf8b60..d2c4bc47 100644 --- a/translations/messages.en.xlf +++ b/translations/messages.en.xlf @@ -13127,5 +13127,17 @@ Buerklin-API Authentication server: By default, the providers returns information in the same language as the website. With that option you can ask the AI to translate it for you. Might only work with certain models. + + + settings.ips.ai_extractor.additional_instructions + Additional instructions + + + + + settings.ips.ai_extractor.additional_instructions.description + The additional instructions will be appended to the system prompt. + +