From 4dbd92ac4d5d806f4f37275d66ad9a3bbbbb7d21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 26 Apr 2026 19:36:03 +0200 Subject: [PATCH] Use markdown as input for the LLM and add extracted microdata separately --- composer.json | 7 + composer.lock | 466 +++++++++++++++--- .../DTOJsonSchemaConverter.php | 14 +- .../Providers/AIInfoExtractor.php | 85 ++-- symfony.lock | 6 - 5 files changed, 481 insertions(+), 97 deletions(-) diff --git a/composer.json b/composer.json index c7c52d5c..c4b3cb59 100644 --- a/composer.json +++ b/composer.json @@ -33,6 +33,7 @@ "jbtronics/dompdf-font-loader-bundle": "^1.0.0", "jbtronics/settings-bundle": "^3.0.0", "jfcherng/php-diff": "^6.14", + "jkphl/micrometa": "dev-master", "knpuniversity/oauth2-client-bundle": "^2.15", "league/commonmark": "^2.7", "league/csv": "^9.8.0", @@ -159,6 +160,12 @@ "App\\Tests\\": "tests/" } }, + "repositories": [ + { + "type": "vcs", + "url": "https://github.com/jbtronics/micrometa" + } + ], "scripts": { "auto-scripts": { "cache:clear": "symfony-cmd", diff --git a/composer.lock b/composer.lock index 00b5e831..1963342f 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "699f421ad81f8a1acacf8e2c4af66491", + "content-hash": "7c76e3af5fd042105a3208fdcb300a11", "packages": [ { "name": "amphp/amp", @@ -3883,16 +3883,16 @@ }, { "name": "doctrine/migrations", - "version": "3.9.6", + "version": "3.9.7", "source": { "type": "git", "url": "https://github.com/doctrine/migrations.git", - "reference": "ffd8355cdd8505fc650d9604f058bf62aedd80a1" + "reference": "96cb2a89b56c9efb0bac38e606dc0b0f13e650ec" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/doctrine/migrations/zipball/ffd8355cdd8505fc650d9604f058bf62aedd80a1", - "reference": "ffd8355cdd8505fc650d9604f058bf62aedd80a1", + "url":
"https://api.github.com/repos/doctrine/migrations/zipball/96cb2a89b56c9efb0bac38e606dc0b0f13e650ec", + "reference": "96cb2a89b56c9efb0bac38e606dc0b0f13e650ec", "shasum": "" }, "require": { @@ -3966,7 +3966,7 @@ ], "support": { "issues": "https://github.com/doctrine/migrations/issues", - "source": "https://github.com/doctrine/migrations/tree/3.9.6" + "source": "https://github.com/doctrine/migrations/tree/3.9.7" }, "funding": [ { @@ -3982,7 +3982,7 @@ "type": "tidelift" } ], - "time": "2026-02-11T06:46:11+00:00" + "time": "2026-04-23T19:33:20+00:00" }, { "name": "doctrine/orm", @@ -4074,19 +4074,20 @@ }, { "name": "doctrine/persistence", - "version": "4.1.1", + "version": "4.2.0", "source": { "type": "git", "url": "https://github.com/doctrine/persistence.git", - "reference": "b9c49ad3558bb77ef973f4e173f2e9c2eca9be09" + "reference": "49ab73e0d3e2ac8d1f5ecda3dd8acd5503781e8b" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/doctrine/persistence/zipball/b9c49ad3558bb77ef973f4e173f2e9c2eca9be09", - "reference": "b9c49ad3558bb77ef973f4e173f2e9c2eca9be09", + "url": "https://api.github.com/repos/doctrine/persistence/zipball/49ab73e0d3e2ac8d1f5ecda3dd8acd5503781e8b", + "reference": "49ab73e0d3e2ac8d1f5ecda3dd8acd5503781e8b", "shasum": "" }, "require": { + "doctrine/deprecations": "^1", "doctrine/event-manager": "^1 || ^2", "php": "^8.1", "psr/cache": "^1.0 || ^2.0 || ^3.0" @@ -4097,13 +4098,13 @@ "phpstan/phpstan-phpunit": "^2", "phpstan/phpstan-strict-rules": "^2", "phpunit/phpunit": "^10.5.58 || ^12", - "symfony/cache": "^4.4 || ^5.4 || ^6.0 || ^7.0", - "symfony/finder": "^4.4 || ^5.4 || ^6.0 || ^7.0" + "symfony/cache": "^4.4 || ^5.4 || ^6.0 || ^7.0 || ^8.0", + "symfony/finder": "^4.4 || ^5.4 || ^6.0 || ^7.0 || ^8.0" }, "type": "library", "autoload": { "psr-4": { - "Doctrine\\Persistence\\": "src/Persistence" + "Doctrine\\Persistence\\": "src" } }, "notification-url": "https://packagist.org/downloads/", @@ -4147,7 +4148,7 @@ ], "support": { "issues": 
"https://github.com/doctrine/persistence/issues", - "source": "https://github.com/doctrine/persistence/tree/4.1.1" + "source": "https://github.com/doctrine/persistence/tree/4.2.0" }, "funding": [ { @@ -4163,7 +4164,7 @@ "type": "tidelift" } ], - "time": "2025-10-16T20:13:18+00:00" + "time": "2026-04-26T12:12:52+00:00" }, { "name": "doctrine/sql-formatter", @@ -5534,6 +5535,191 @@ ], "time": "2023-05-21T07:57:08+00:00" }, + { + "name": "jkphl/dom-factory", + "version": "v1.0.1", + "source": { + "type": "git", + "url": "https://github.com/jkphl/dom-factory.git", + "reference": "dd32b8b2cc800f065c0eff8bb621d9f80147d45e" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/jkphl/dom-factory/zipball/dd32b8b2cc800f065c0eff8bb621d9f80147d45e", + "reference": "dd32b8b2cc800f065c0eff8bb621d9f80147d45e", + "shasum": "" + }, + "require": { + "ext-dom": "*", + "ext-libxml": "*", + "ext-mbstring": "*", + "guzzlehttp/guzzle": "^6.0||^7.0", + "masterminds/html5": "^2.7", + "php": ">=7.2" + }, + "require-dev": { + "clue/graph-composer": "^1.1", + "php-coveralls/php-coveralls": "^2.2", + "phpunit/phpunit": "^8.0||^9.0", + "squizlabs/php_codesniffer": "^3.5" + }, + "type": "library", + "autoload": { + "psr-4": { + "Jkphl\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Joschi Kuphal", + "email": "joschi@kuphal.net", + "homepage": "https://jkphl.is", + "role": "Developer" + } + ], + "description": "Simple HTML5/XML DOM factory", + "homepage": "https://github.com/jkphl/dom-factory", + "support": { + "email": "joschi@kuphal.net", + "issues": "https://github.com/jkphl/dom-factory/issues", + "source": "https://github.com/jkphl/dom-factory" + }, + "time": "2021-06-28T11:49:36+00:00" + }, + { + "name": "jkphl/micrometa", + "version": "dev-master", + "source": { + "type": "git", + "url": "https://github.com/jbtronics/micrometa.git", + "reference": 
"720f409151c2cc20add9478b7a0a635fa1707021" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/jbtronics/micrometa/zipball/720f409151c2cc20add9478b7a0a635fa1707021", + "reference": "720f409151c2cc20add9478b7a0a635fa1707021", + "shasum": "" + }, + "require": { + "ext-dom": "*", + "jkphl/dom-factory": "^1", + "jkphl/rdfa-lite-microdata": "^0.4.4", + "league/uri": "^5.0|^6.5|^7.0", + "mf2/mf2": "^0.4", + "ml/json-ld": "^1.2", + "monolog/monolog": "^1.24 || ^2 || ^3", + "php": ">=7.1.3", + "psr/cache": "^1.0|^2|^3", + "psr/log": "^1.1|^2|^3", + "symfony/cache": "^4.0|^5.0|^6.0|^7.0|^8.0" + }, + "require-dev": { + "clue/graph-composer": "^1.1", + "mf2/tests": "@dev", + "php-coveralls/php-coveralls": "^2.1", + "phpunit/phpunit": "^7.0 || ^8.5", + "squizlabs/php_codesniffer": "^3.3" + }, + "default-branch": true, + "type": "library", + "autoload": { + "psr-4": { + "Jkphl\\": "src/" + } + }, + "scripts": { + "phpunit": [ + "vendor/bin/phpunit --configuration phpunit.xml.dist" + ], + "depgraph": [ + "vendor/bin/graph-composer --no-dev export . 
doc/dependencies.svg" + ], + "check-style": [ + "vendor/bin/phpcs -p --standard=PSR2 --runtime-set ignore_errors_on_exit 1 --runtime-set ignore_warnings_on_exit 1 src" + ], + "fix-style": [ + "vendor/bin/phpcbf -p --standard=PSR2 --runtime-set ignore_errors_on_exit 1 --runtime-set ignore_warnings_on_exit 1 src" + ], + "test": [ + "@phpunit" + ] + }, + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Joschi Kuphal", + "email": "joschi@tollwerk.de", + "homepage": "https://jkphl.is", + "role": "Developer" + } + ], + "description": "A meta parser for extracting micro information out of web documents, currently supporting Microformats 1+2, HTML Microdata, RDFa Lite 1.1 and JSON-LD", + "homepage": "https://jkphl.is/projects/micrometa/", + "support": { + "email": "joschi@tollwerk.de", + "source": "https://github.com/jkphl/micrometa", + "issues": "https://github.com/jkphl/micrometa/issues" + }, + "time": "2026-04-26T17:25:19+00:00" + }, + { + "name": "jkphl/rdfa-lite-microdata", + "version": "v0.4.7", + "source": { + "type": "git", + "url": "https://github.com/jkphl/rdfa-lite-microdata.git", + "reference": "ffc4940e8be55798257a03da7ed7d4506a13c3e5" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/jkphl/rdfa-lite-microdata/zipball/ffc4940e8be55798257a03da7ed7d4506a13c3e5", + "reference": "ffc4940e8be55798257a03da7ed7d4506a13c3e5", + "shasum": "" + }, + "require": { + "jkphl/dom-factory": "^1", + "php": ">=5.5" + }, + "require-dev": { + "clue/graph-composer": "dev-master", + "codeclimate/php-test-reporter": "^0.4.4", + "phpunit/phpunit": "^4.8", + "satooshi/php-coveralls": "^1.0", + "squizlabs/php_codesniffer": "^2.8" + }, + "type": "library", + "autoload": { + "psr-4": { + "Jkphl\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Joschi Kuphal", + "email": "joschi@tollwerk.de", + "homepage": "https://jkphl.is", + "role": "Developer" + } + ], + "description": 
"RDFa Lite 1.1 and HTML Microdata parser for web documents (HTML, SVG, XML)", + "homepage": "https://github.com/jkphl/rdfa-lite-microdata", + "support": { + "email": "joschi@tollwerk.de", + "issues": "https://github.com/jkphl/rdfa-lite-microdata/issues", + "source": "https://github.com/jkphl/rdfa-lite-microdata" + }, + "time": "2023-01-27T13:29:45+00:00" + }, { "name": "kelunik/certificate", "version": "v1.1.3", @@ -6899,6 +7085,170 @@ }, "time": "2025-07-25T09:04:22+00:00" }, + { + "name": "mf2/mf2", + "version": "0.4.6", + "source": { + "type": "git", + "url": "https://github.com/microformats/php-mf2.git", + "reference": "00b70ee7eb7f5b0585b1bd467f6c9cbd75055d23" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/microformats/php-mf2/zipball/00b70ee7eb7f5b0585b1bd467f6c9cbd75055d23", + "reference": "00b70ee7eb7f5b0585b1bd467f6c9cbd75055d23", + "shasum": "" + }, + "require": { + "php": ">=5.4.0" + }, + "require-dev": { + "mf2/tests": "@dev", + "phpdocumentor/phpdocumentor": "v2.8.4", + "phpunit/phpunit": "4.8.*" + }, + "suggest": { + "barnabywalters/mf-cleaner": "To more easily handle the canonical data php-mf2 gives you", + "masterminds/html5": "Alternative HTML parser for PHP, for better HTML5 support." 
+ }, + "bin": [ + "bin/fetch-mf2", + "bin/parse-mf2" + ], + "type": "library", + "autoload": { + "files": [ + "Mf2/Parser.php" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "CC0-1.0" + ], + "authors": [ + { + "name": "Barnaby Walters", + "homepage": "http://waterpigs.co.uk" + } + ], + "description": "A pure, generic microformats2 parser — makes HTML as easy to consume as a JSON API", + "keywords": [ + "html", + "microformats", + "microformats 2", + "parser", + "semantic" + ], + "support": { + "issues": "https://github.com/microformats/php-mf2/issues", + "source": "https://github.com/microformats/php-mf2/tree/master" + }, + "time": "2018-08-24T14:47:04+00:00" + }, + { + "name": "ml/iri", + "version": "1.1.4", + "target-dir": "ML/IRI", + "source": { + "type": "git", + "url": "https://github.com/lanthaler/IRI.git", + "reference": "cbd44fa913e00ea624241b38cefaa99da8d71341" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/lanthaler/IRI/zipball/cbd44fa913e00ea624241b38cefaa99da8d71341", + "reference": "cbd44fa913e00ea624241b38cefaa99da8d71341", + "shasum": "" + }, + "require": { + "lib-pcre": ">=4.0", + "php": ">=5.3.0" + }, + "type": "library", + "autoload": { + "psr-0": { + "ML\\IRI": "" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Markus Lanthaler", + "email": "mail@markus-lanthaler.com", + "homepage": "http://www.markus-lanthaler.com", + "role": "Developer" + } + ], + "description": "IRI handling for PHP", + "homepage": "http://www.markus-lanthaler.com", + "keywords": [ + "URN", + "iri", + "uri", + "url" + ], + "support": { + "issues": "https://github.com/lanthaler/IRI/issues", + "source": "https://github.com/lanthaler/IRI/tree/master" + }, + "time": "2014-01-21T13:43:39+00:00" + }, + { + "name": "ml/json-ld", + "version": "1.2.1", + "source": { + "type": "git", + "url": "https://github.com/lanthaler/JsonLD.git", + "reference": 
"537e68e87a6bce23e57c575cd5dcac1f67ce25d8" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/lanthaler/JsonLD/zipball/537e68e87a6bce23e57c575cd5dcac1f67ce25d8", + "reference": "537e68e87a6bce23e57c575cd5dcac1f67ce25d8", + "shasum": "" + }, + "require": { + "ext-json": "*", + "ml/iri": "^1.1.1", + "php": ">=5.3.0" + }, + "require-dev": { + "json-ld/tests": "1.0", + "phpunit/phpunit": "^4" + }, + "type": "library", + "autoload": { + "psr-4": { + "ML\\JsonLD\\": "" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Markus Lanthaler", + "email": "mail@markus-lanthaler.com", + "homepage": "http://www.markus-lanthaler.com", + "role": "Developer" + } + ], + "description": "JSON-LD Processor for PHP", + "homepage": "http://www.markus-lanthaler.com", + "keywords": [ + "JSON-LD", + "jsonld" + ], + "support": { + "issues": "https://github.com/lanthaler/JsonLD/issues", + "source": "https://github.com/lanthaler/JsonLD/tree/1.2.1" + }, + "time": "2022-09-29T08:45:17+00:00" + }, { "name": "monolog/monolog", "version": "3.10.0", @@ -9409,16 +9759,16 @@ }, { "name": "rhukster/dom-sanitizer", - "version": "1.0.10", + "version": "1.0.11", "source": { "type": "git", "url": "https://github.com/rhukster/dom-sanitizer.git", - "reference": "49a98046b708a4c92f754f5b0ef1720bb85142e2" + "reference": "02d08ec8b36b93b04517d74fe82b715ef06273bd" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/rhukster/dom-sanitizer/zipball/49a98046b708a4c92f754f5b0ef1720bb85142e2", - "reference": "49a98046b708a4c92f754f5b0ef1720bb85142e2", + "url": "https://api.github.com/repos/rhukster/dom-sanitizer/zipball/02d08ec8b36b93b04517d74fe82b715ef06273bd", + "reference": "02d08ec8b36b93b04517d74fe82b715ef06273bd", "shasum": "" }, "require": { @@ -9448,9 +9798,9 @@ "description": "A simple but effective DOM/SVG/MathML Sanitizer for PHP 7.4+", "support": { "issues": 
"https://github.com/rhukster/dom-sanitizer/issues", - "source": "https://github.com/rhukster/dom-sanitizer/tree/1.0.10" + "source": "https://github.com/rhukster/dom-sanitizer/tree/1.0.11" }, - "time": "2026-04-10T17:00:11+00:00" + "time": "2026-04-23T22:56:32+00:00" }, { "name": "robrichards/xmlseclibs", @@ -13693,7 +14043,7 @@ }, { "name": "symfony/polyfill-ctype", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-ctype.git", @@ -13752,7 +14102,7 @@ "portable" ], "support": { - "source": "https://github.com/symfony/polyfill-ctype/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-ctype/tree/v1.37.0" }, "funding": [ { @@ -13776,16 +14126,16 @@ }, { "name": "symfony/polyfill-intl-grapheme", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-intl-grapheme.git", - "reference": "ad1b7b9092976d6c948b8a187cec9faaea9ec1df" + "reference": "4864388bfbd3001ce88e234fab652acd91fdc57e" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/symfony/polyfill-intl-grapheme/zipball/ad1b7b9092976d6c948b8a187cec9faaea9ec1df", - "reference": "ad1b7b9092976d6c948b8a187cec9faaea9ec1df", + "url": "https://api.github.com/repos/symfony/polyfill-intl-grapheme/zipball/4864388bfbd3001ce88e234fab652acd91fdc57e", + "reference": "4864388bfbd3001ce88e234fab652acd91fdc57e", "shasum": "" }, "require": { @@ -13834,7 +14184,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-intl-grapheme/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-intl-grapheme/tree/v1.37.0" }, "funding": [ { @@ -13854,11 +14204,11 @@ "type": "tidelift" } ], - "time": "2026-04-10T16:19:22+00:00" + "time": "2026-04-26T13:13:48+00:00" }, { "name": "symfony/polyfill-intl-icu", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-intl-icu.git", @@ -13922,7 +14272,7 @@ 
"shim" ], "support": { - "source": "https://github.com/symfony/polyfill-intl-icu/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-intl-icu/tree/v1.37.0" }, "funding": [ { @@ -13946,7 +14296,7 @@ }, { "name": "symfony/polyfill-intl-idn", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-intl-idn.git", @@ -14009,7 +14359,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-intl-idn/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-intl-idn/tree/v1.37.0" }, "funding": [ { @@ -14033,7 +14383,7 @@ }, { "name": "symfony/polyfill-intl-normalizer", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-intl-normalizer.git", @@ -14094,7 +14444,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-intl-normalizer/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-intl-normalizer/tree/v1.37.0" }, "funding": [ { @@ -14118,7 +14468,7 @@ }, { "name": "symfony/polyfill-php83", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-php83.git", @@ -14174,7 +14524,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-php83/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-php83/tree/v1.37.0" }, "funding": [ { @@ -14198,7 +14548,7 @@ }, { "name": "symfony/polyfill-php84", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-php84.git", @@ -14254,7 +14604,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-php84/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-php84/tree/v1.37.0" }, "funding": [ { @@ -14278,16 +14628,16 @@ }, { "name": "symfony/polyfill-php85", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": 
"https://github.com/symfony/polyfill-php85.git", - "reference": "2c408a6bb0313e6001a83628dc5506100474254e" + "reference": "fcfa4973a9917cef23f2e38774da74a2b7d115ee" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/symfony/polyfill-php85/zipball/2c408a6bb0313e6001a83628dc5506100474254e", - "reference": "2c408a6bb0313e6001a83628dc5506100474254e", + "url": "https://api.github.com/repos/symfony/polyfill-php85/zipball/fcfa4973a9917cef23f2e38774da74a2b7d115ee", + "reference": "fcfa4973a9917cef23f2e38774da74a2b7d115ee", "shasum": "" }, "require": { @@ -14334,7 +14684,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-php85/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-php85/tree/v1.37.0" }, "funding": [ { @@ -14354,11 +14704,11 @@ "type": "tidelift" } ], - "time": "2026-04-10T16:50:15+00:00" + "time": "2026-04-26T13:10:57+00:00" }, { "name": "symfony/polyfill-uuid", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-uuid.git", @@ -14417,7 +14767,7 @@ "uuid" ], "support": { - "source": "https://github.com/symfony/polyfill-uuid/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-uuid/tree/v1.37.0" }, "funding": [ { @@ -19854,12 +20204,12 @@ "source": { "type": "git", "url": "https://github.com/Roave/SecurityAdvisories.git", - "reference": "10b8a93511210c9bae3be31f4fe13c3ff974cad4" + "reference": "08cd07f04fb07fb4d316e956801d57b700cf7096" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/Roave/SecurityAdvisories/zipball/10b8a93511210c9bae3be31f4fe13c3ff974cad4", - "reference": "10b8a93511210c9bae3be31f4fe13c3ff974cad4", + "url": "https://api.github.com/repos/Roave/SecurityAdvisories/zipball/08cd07f04fb07fb4d316e956801d57b700cf7096", + "reference": "08cd07f04fb07fb4d316e956801d57b700cf7096", "shasum": "" }, "conflict": { @@ -19882,6 +20232,7 @@ "alextselegidis/easyappointments": "<=1.5.2", 
"alexusmai/laravel-file-manager": "<=3.3.1", "algolia/algoliasearch-magento-2": "<=3.16.1|>=3.17.0.0-beta1,<=3.17.1", + "almirhodzic/nova-toggle-5": "<1.3", "alt-design/alt-redirect": "<1.6.4", "altcha-org/altcha": "<1.3.1", "alterphp/easyadmin-extension-bundle": ">=1.2,<1.2.11|>=1.3,<1.3.1", @@ -19978,7 +20329,7 @@ "ckeditor/ckeditor": "<4.25", "clickstorm/cs-seo": ">=6,<6.8|>=7,<7.5|>=8,<8.4|>=9,<9.3", "co-stack/fal_sftp": "<0.2.6", - "cockpit-hq/cockpit": "<2.13.5", + "cockpit-hq/cockpit": "<2.14", "code16/sharp": "<9.20", "codeception/codeception": "<3.1.3|>=4,<4.1.22", "codeigniter/framework": "<3.1.10", @@ -20141,7 +20492,7 @@ "fisharebest/webtrees": "<=2.1.18", "fixpunkt/fp-masterquiz": "<2.2.1|>=3,<3.5.2", "fixpunkt/fp-newsletter": "<1.1.1|>=1.2,<2.1.2|>=2.2,<3.2.6", - "flarum/core": "<1.8.10", + "flarum/core": "<=1.8.15|>=2.0.0.0-beta1,<=2.0.0.0-beta8", "flarum/flarum": "<0.1.0.0-beta8", "flarum/framework": "<1.8.10", "flarum/mentions": "<1.6.3", @@ -20178,7 +20529,7 @@ "geshi/geshi": "<=1.0.9.1", "getformwork/formwork": "<=2.3.3", "getgrav/grav": "<1.11.0.0-beta1", - "getkirby/cms": "<=5.2.1", + "getkirby/cms": "<5.4", "getkirby/kirby": "<3.9.8.3-dev|>=3.10,<3.10.1.2-dev|>=4,<4.7.1", "getkirby/panel": "<2.5.14", "getkirby/starterkit": "<=3.7.0.2", @@ -20276,7 +20627,7 @@ "kelvinmo/simplexrd": "<3.1.1", "kevinpapst/kimai2": "<1.16.7", "khodakhah/nodcms": "<=3.4.1", - "kimai/kimai": "<=2.53", + "kimai/kimai": "<2.54", "kitodo/presentation": "<3.2.3|>=3.3,<3.3.4", "klaviyo/magento2-extension": ">=1,<3", "knplabs/knp-snappy": "<=1.4.2", @@ -20720,7 +21071,7 @@ "twig/twig": "<3.11.2|>=3.12,<3.14.1|>=3.16,<3.19", "typicms/core": "<16.1.7", "typo3/cms": "<9.5.29|>=10,<10.4.35|>=11,<11.5.23|>=12,<12.2", - "typo3/cms-backend": "<4.1.14|>=4.2,<4.2.15|>=4.3,<4.3.7|>=4.4,<4.4.4|>=7,<=7.6.50|>=8,<=8.7.39|>=9,<9.5.55|>=10,<=10.4.54|>=11,<=11.5.48|>=12,<=12.4.40|>=13,<=13.4.22|>=14,<=14.0.1", + "typo3/cms-backend": 
"<4.1.14|>=4.2,<4.2.15|>=4.3,<4.3.7|>=4.4,<4.4.4|>=7,<=7.6.50|>=8,<=8.7.39|>=9,<9.5.55|>=10,<=10.4.54|>=11,<=11.5.48|>=12,<=12.4.40|>=13,<=13.4.22|>=14,<=14.0.1|==14.2", "typo3/cms-belog": ">=10,<=10.4.47|>=11,<=11.5.41|>=12,<=12.4.24|>=13,<=13.4.2", "typo3/cms-beuser": ">=9,<9.5.55|>=10,<10.4.54|>=11,<11.5.48|>=12,<12.4.37|>=13,<13.4.18", "typo3/cms-core": "<=8.7.56|>=9,<9.5.55|>=10,<=10.4.54|>=11,<=11.5.48|>=12,<=12.4.40|>=13,<=13.4.22|>=14,<=14.0.1", @@ -20902,7 +21253,7 @@ "type": "tidelift" } ], - "time": "2026-04-22T18:27:19+00:00" + "time": "2026-04-24T17:22:29+00:00" }, { "name": "sebastian/cli-parser", @@ -22418,6 +22769,7 @@ "aliases": [], "minimum-stability": "stable", "stability-flags": { + "jkphl/micrometa": 20, "roave/security-advisories": 20 }, "prefer-stable": false, diff --git a/src/Services/InfoProviderSystem/DTOJsonSchemaConverter.php b/src/Services/InfoProviderSystem/DTOJsonSchemaConverter.php index b9208c87..2d297243 100644 --- a/src/Services/InfoProviderSystem/DTOJsonSchemaConverter.php +++ b/src/Services/InfoProviderSystem/DTOJsonSchemaConverter.php @@ -54,7 +54,8 @@ final class DTOJsonSchemaConverter 'category' => ['type' => ['string', 'null'], 'description' => 'Product category'], 'manufacturing_status' => ['type' => ['string', 'null'], 'enum' => ['active', 'obsolete', 'nrfnd', 'discontinued', null], 'description' => 'Manufacturing status'], 'footprint' => ['type' => ['string', 'null'], 'description' => 'Package/footprint type'], - 'mass' => ['type' => ['number', 'null'], 'description' => 'Mass in grams'], + 'mass' => ['type' => ['number', 'null'], 'description' => 'Mass of the product in grams'], + 'gtin' => ['type' => ['string', 'null'], 'description' => 'Global Trade Item Number (GTIN) / EAN / UPC code'], 'parameters' => [ 'type' => 'array', 'items' => [ @@ -94,17 +95,17 @@ final class DTOJsonSchemaConverter 'items' => [ 'type' => 'object', 'properties' => [ - 'distributor_name' => ['type' => 'string'], - 'order_number' => ['type' => 
['string', 'null']], + 'distributor_name' => ['type' => 'string', 'description' => 'Name of the distributor or vendor. Typically the shop name'], + 'order_number' => ['type' => ['string', 'null'], 'description' => 'The order number or SKU used by the distributor. Optional, but can help to find the product on the distributor website.'], 'product_url' => ['type' => 'string'], 'prices' => [ 'type' => 'array', 'items' => [ 'type' => 'object', 'properties' => [ - 'minimum_quantity' => ['type' => 'integer'], - 'price' => ['type' => 'number'], - 'currency' => ['type' => 'string'], + 'minimum_quantity' => ['type' => 'integer', 'description' => 'Minimum quantity for this price tier. 1 when no tiered pricing is available.'], + 'price' => ['type' => 'number', 'description' => 'Price for the given minimum quantity.'], + 'currency' => ['type' => 'string', 'description' => 'Currency ISO code, e.g. USD'], ], 'required' => ['minimum_quantity', 'price', 'currency'], ], @@ -226,6 +227,7 @@ final class DTOJsonSchemaConverter manufacturing_status: $manufacturingStatus, provider_url: $productUrl, footprint: $data['footprint'] ?? null, + gtin: $data['gtin'] ?? 
null, notes: null, datasheets: $datasheets, images: $images, diff --git a/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php b/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php index c8eff0a4..7ae858a6 100644 --- a/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php +++ b/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php @@ -29,10 +29,15 @@ use App\Services\AI\AIPlatformRegistry; use App\Services\InfoProviderSystem\DTOJsonSchemaConverter; use App\Services\InfoProviderSystem\DTOs\PartDetailDTO; use App\Settings\InfoProviderSystem\AIExtractorSettings; +use Brick\Schema\SchemaReader; +use Jkphl\Micrometa; +use League\HTMLToMarkdown\HtmlConverter; use Symfony\AI\Platform\Message\Message; use Symfony\AI\Platform\Message\MessageBag; +use Symfony\Component\DomCrawler\Crawler; use Symfony\Contracts\HttpClient\HttpClientInterface; + final class AIInfoExtractor implements InfoProviderInterface { use FixAndValidateUrlTrait; @@ -95,16 +100,56 @@ final class AIInfoExtractor implements InfoProviderInterface $html = $response->getContent(); // Clean HTML - $cleanedHtml = $this->cleanHTML($html); + /*$cleanedHtml = $this->cleanHTML($html); // Truncate to max content length - $truncatedHtml = $this->truncateHTML($cleanedHtml, $this->settings->maxContentLength); + $truncatedHtml = $this->truncateHTML($cleanedHtml, $this->settings->maxContentLength);*/ + + $markdown = $this->htmlToMarkdown($html); + + //Extract structured data using traditional methods, to provide additional context to the LLM. This can help improve accuracy, especially for technical specifications that might be in tables or specific formats. 
+ $structuredData = $this->extractStructuredData($html, $url); // Call LLM - $llmResponse = $this->callLLM($truncatedHtml, $url); + $llmResponse = $this->callLLM($markdown, $url, $structuredData); // Build and return PartDetailDTO - return $this->jsonSchemaConverter->jsonToDTO($llmResponse, $this->getProviderKey(), $url, $url, self::DISTRIBUTOR_NAME); + $result = $this->jsonSchemaConverter->jsonToDTO($llmResponse, $this->getProviderKey(), $url, $url, self::DISTRIBUTOR_NAME); + + return $result; + } + + /** + * Extracts embedded structured data (Microformats, HTML Microdata, RDFa Lite, JSON-LD) from the HTML using the micrometa parser. + * @param string $html The raw HTML source of the page + * @param string $url The URL of the page, passed to the parser together with the HTML + * @return string JSON encoded structured data + */ + private function extractStructuredData(string $html, string $url): string + { + $micrometa = new Micrometa\Ports\Parser(); + $items = $micrometa($url, $html); + + return json_encode($items->toObject(), JSON_THROW_ON_ERROR); + } + + private function htmlToMarkdown(string $html): string + { + //Extract only the main content of the page to avoid overwhelming the LLM with irrelevant information. + $crawler = new Crawler($html); + $mainContent = $crawler->filter('main, article, #content')->first(); + + // If we found a specific content area, get its HTML; otherwise, fall back to the complete HTML document. + $htmlToConvert = $mainContent->count() ? $mainContent->html() : $html; + + //Convert to markdown + $converter = new HtmlConverter([ + 'strip_tags' => true, // Strips tags without a Markdown equivalent (like div or span) but keeps their content
+ 'hard_break' => true, // Emit line breaks as a bare newline (GFM style) instead of two trailing spaces + newline + 'remove_nodes' => 'nav footer script style' // Drop these elements including their content (extra safety layer) + ]); + + return $converter->convert($htmlToConvert); } public function getCapabilities(): array @@ -160,13 +205,18 @@ final class AIInfoExtractor implements InfoProviderInterface return $truncated; } - private function callLLM(string $htmlContent, string $url): array + private function callLLM(string $htmlContent, string $url, ?string $structuredData = null): array { $input = new MessageBag( Message::forSystem($this->buildSystemPrompt()), Message::ofUser("Extract part information from this webpage content:\n\nURL: $url\n\n$htmlContent") ); + if ($structuredData) { + $input->add(Message::ofUser("Following data was extracted using traditional methods, but might be incomplete or inaccurate. + Enrich it with the actual website data:\n\n".$structuredData)); + } + try { $aiPlatform = $this->AIPlatformRegistry->getPlatform($this->settings->platform ?? throw new \RuntimeException('No AI platform selected') ); @@ -187,29 +237,8 @@ final class AIInfoExtractor implements InfoProviderInterface private function buildSystemPrompt(): string { return <<<'PROMPT' -You are an expert at extracting electronic component information from web pages. Extract structured data in JSON format.
- -Return ONLY a valid JSON object with this exact structure: -{ - "name": "string", - "description": "string", - "manufacturer": "string | null", - "mpn": "string | null", - "category": "string | null", - "manufacturing_status": "active|obsolete|nrfnd|discontinued|null", - "footprint": "string | null", - "mass": "number | null (in grams)", - "parameters": [{"name": "string", "value": "string", "unit": "string | null"}], - "datasheets": [{"url": "string", "description": "string"}], - "images": [{"url": "string", "description": "string"}], - "vendor_infos": [{ - "distributor_name": "string", - "order_number": "string | null", - "product_url": "string", - "prices": [{"minimum_quantity": int, "price": number, "currency": "string"}] - }], - "manufacturer_product_url": "string | null" -} +You are an expert at extracting electronic component information from web pages. Extract structured data in JSON format, from markdown extracted from a product page. +Focus on the main content of the page, such as product descriptions, specifications, and tables. Ignore navigation menus, footers, and sidebars. Rules: - manufacturing_status: Use "active", "obsolete", "nrfnd" (not recommended for new designs), "discontinued", or null diff --git a/symfony.lock b/symfony.lock index dde7df36..f8f88675 100644 --- a/symfony.lock +++ b/symfony.lock @@ -441,12 +441,6 @@ "symfony/browser-kit": { "version": "v4.2.3" }, - "symfony/cache": { - "version": "v4.2.3" - }, - "symfony/cache-contracts": { - "version": "v1.1.5" - }, "symfony/config": { "version": "v4.2.3" },