Use markdown as input for the LLM and add extracted microdata separately

This commit is contained in:
Jan Böhmer 2026-04-26 19:36:03 +02:00
parent af98fc1079
commit 4dbd92ac4d
5 changed files with 481 additions and 97 deletions

View file

@ -33,6 +33,7 @@
"jbtronics/dompdf-font-loader-bundle": "^1.0.0",
"jbtronics/settings-bundle": "^3.0.0",
"jfcherng/php-diff": "^6.14",
"jkphl/micrometa": "dev-master",
"knpuniversity/oauth2-client-bundle": "^2.15",
"league/commonmark": "^2.7",
"league/csv": "^9.8.0",
@ -159,6 +160,12 @@
"App\\Tests\\": "tests/"
}
},
"repositories": [
{
"type": "vcs",
"url": "https://github.com/jbtronics/micrometa"
}
],
"scripts": {
"auto-scripts": {
"cache:clear": "symfony-cmd",

466
composer.lock generated
View file

@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically"
],
"content-hash": "699f421ad81f8a1acacf8e2c4af66491",
"content-hash": "7c76e3af5fd042105a3208fdcb300a11",
"packages": [
{
"name": "amphp/amp",
@ -3883,16 +3883,16 @@
},
{
"name": "doctrine/migrations",
"version": "3.9.6",
"version": "3.9.7",
"source": {
"type": "git",
"url": "https://github.com/doctrine/migrations.git",
"reference": "ffd8355cdd8505fc650d9604f058bf62aedd80a1"
"reference": "96cb2a89b56c9efb0bac38e606dc0b0f13e650ec"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/doctrine/migrations/zipball/ffd8355cdd8505fc650d9604f058bf62aedd80a1",
"reference": "ffd8355cdd8505fc650d9604f058bf62aedd80a1",
"url": "https://api.github.com/repos/doctrine/migrations/zipball/96cb2a89b56c9efb0bac38e606dc0b0f13e650ec",
"reference": "96cb2a89b56c9efb0bac38e606dc0b0f13e650ec",
"shasum": ""
},
"require": {
@ -3966,7 +3966,7 @@
],
"support": {
"issues": "https://github.com/doctrine/migrations/issues",
"source": "https://github.com/doctrine/migrations/tree/3.9.6"
"source": "https://github.com/doctrine/migrations/tree/3.9.7"
},
"funding": [
{
@ -3982,7 +3982,7 @@
"type": "tidelift"
}
],
"time": "2026-02-11T06:46:11+00:00"
"time": "2026-04-23T19:33:20+00:00"
},
{
"name": "doctrine/orm",
@ -4074,19 +4074,20 @@
},
{
"name": "doctrine/persistence",
"version": "4.1.1",
"version": "4.2.0",
"source": {
"type": "git",
"url": "https://github.com/doctrine/persistence.git",
"reference": "b9c49ad3558bb77ef973f4e173f2e9c2eca9be09"
"reference": "49ab73e0d3e2ac8d1f5ecda3dd8acd5503781e8b"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/doctrine/persistence/zipball/b9c49ad3558bb77ef973f4e173f2e9c2eca9be09",
"reference": "b9c49ad3558bb77ef973f4e173f2e9c2eca9be09",
"url": "https://api.github.com/repos/doctrine/persistence/zipball/49ab73e0d3e2ac8d1f5ecda3dd8acd5503781e8b",
"reference": "49ab73e0d3e2ac8d1f5ecda3dd8acd5503781e8b",
"shasum": ""
},
"require": {
"doctrine/deprecations": "^1",
"doctrine/event-manager": "^1 || ^2",
"php": "^8.1",
"psr/cache": "^1.0 || ^2.0 || ^3.0"
@ -4097,13 +4098,13 @@
"phpstan/phpstan-phpunit": "^2",
"phpstan/phpstan-strict-rules": "^2",
"phpunit/phpunit": "^10.5.58 || ^12",
"symfony/cache": "^4.4 || ^5.4 || ^6.0 || ^7.0",
"symfony/finder": "^4.4 || ^5.4 || ^6.0 || ^7.0"
"symfony/cache": "^4.4 || ^5.4 || ^6.0 || ^7.0 || ^8.0",
"symfony/finder": "^4.4 || ^5.4 || ^6.0 || ^7.0 || ^8.0"
},
"type": "library",
"autoload": {
"psr-4": {
"Doctrine\\Persistence\\": "src/Persistence"
"Doctrine\\Persistence\\": "src"
}
},
"notification-url": "https://packagist.org/downloads/",
@ -4147,7 +4148,7 @@
],
"support": {
"issues": "https://github.com/doctrine/persistence/issues",
"source": "https://github.com/doctrine/persistence/tree/4.1.1"
"source": "https://github.com/doctrine/persistence/tree/4.2.0"
},
"funding": [
{
@ -4163,7 +4164,7 @@
"type": "tidelift"
}
],
"time": "2025-10-16T20:13:18+00:00"
"time": "2026-04-26T12:12:52+00:00"
},
{
"name": "doctrine/sql-formatter",
@ -5534,6 +5535,191 @@
],
"time": "2023-05-21T07:57:08+00:00"
},
{
"name": "jkphl/dom-factory",
"version": "v1.0.1",
"source": {
"type": "git",
"url": "https://github.com/jkphl/dom-factory.git",
"reference": "dd32b8b2cc800f065c0eff8bb621d9f80147d45e"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/jkphl/dom-factory/zipball/dd32b8b2cc800f065c0eff8bb621d9f80147d45e",
"reference": "dd32b8b2cc800f065c0eff8bb621d9f80147d45e",
"shasum": ""
},
"require": {
"ext-dom": "*",
"ext-libxml": "*",
"ext-mbstring": "*",
"guzzlehttp/guzzle": "^6.0||^7.0",
"masterminds/html5": "^2.7",
"php": ">=7.2"
},
"require-dev": {
"clue/graph-composer": "^1.1",
"php-coveralls/php-coveralls": "^2.2",
"phpunit/phpunit": "^8.0||^9.0",
"squizlabs/php_codesniffer": "^3.5"
},
"type": "library",
"autoload": {
"psr-4": {
"Jkphl\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Joschi Kuphal",
"email": "joschi@kuphal.net",
"homepage": "https://jkphl.is",
"role": "Developer"
}
],
"description": "Simple HTML5/XML DOM factory",
"homepage": "https://github.com/jkphl/dom-factory",
"support": {
"email": "joschi@kuphal.net",
"issues": "https://github.com/jkphl/dom-factory/issues",
"source": "https://github.com/jkphl/dom-factory"
},
"time": "2021-06-28T11:49:36+00:00"
},
{
"name": "jkphl/micrometa",
"version": "dev-master",
"source": {
"type": "git",
"url": "https://github.com/jbtronics/micrometa.git",
"reference": "720f409151c2cc20add9478b7a0a635fa1707021"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/jbtronics/micrometa/zipball/720f409151c2cc20add9478b7a0a635fa1707021",
"reference": "720f409151c2cc20add9478b7a0a635fa1707021",
"shasum": ""
},
"require": {
"ext-dom": "*",
"jkphl/dom-factory": "^1",
"jkphl/rdfa-lite-microdata": "^0.4.4",
"league/uri": "^5.0|^6.5|^7.0",
"mf2/mf2": "^0.4",
"ml/json-ld": "^1.2",
"monolog/monolog": "^1.24 || ^2 || ^3",
"php": ">=7.1.3",
"psr/cache": "^1.0|^2|^3",
"psr/log": "^1.1|^2|^3",
"symfony/cache": "^4.0|^5.0|^6.0|^7.0|^8.0"
},
"require-dev": {
"clue/graph-composer": "^1.1",
"mf2/tests": "@dev",
"php-coveralls/php-coveralls": "^2.1",
"phpunit/phpunit": "^7.0 || ^8.5",
"squizlabs/php_codesniffer": "^3.3"
},
"default-branch": true,
"type": "library",
"autoload": {
"psr-4": {
"Jkphl\\": "src/"
}
},
"scripts": {
"phpunit": [
"vendor/bin/phpunit --configuration phpunit.xml.dist"
],
"depgraph": [
"vendor/bin/graph-composer --no-dev export . doc/dependencies.svg"
],
"check-style": [
"vendor/bin/phpcs -p --standard=PSR2 --runtime-set ignore_errors_on_exit 1 --runtime-set ignore_warnings_on_exit 1 src"
],
"fix-style": [
"vendor/bin/phpcbf -p --standard=PSR2 --runtime-set ignore_errors_on_exit 1 --runtime-set ignore_warnings_on_exit 1 src"
],
"test": [
"@phpunit"
]
},
"license": [
"MIT"
],
"authors": [
{
"name": "Joschi Kuphal",
"email": "joschi@tollwerk.de",
"homepage": "https://jkphl.is",
"role": "Developer"
}
],
"description": "A meta parser for extracting micro information out of web documents, currently supporting Microformats 1+2, HTML Microdata, RDFa Lite 1.1 and JSON-LD",
"homepage": "https://jkphl.is/projects/micrometa/",
"support": {
"email": "joschi@tollwerk.de",
"source": "https://github.com/jkphl/micrometa",
"issues": "https://github.com/jkphl/micrometa/issues"
},
"time": "2026-04-26T17:25:19+00:00"
},
{
"name": "jkphl/rdfa-lite-microdata",
"version": "v0.4.7",
"source": {
"type": "git",
"url": "https://github.com/jkphl/rdfa-lite-microdata.git",
"reference": "ffc4940e8be55798257a03da7ed7d4506a13c3e5"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/jkphl/rdfa-lite-microdata/zipball/ffc4940e8be55798257a03da7ed7d4506a13c3e5",
"reference": "ffc4940e8be55798257a03da7ed7d4506a13c3e5",
"shasum": ""
},
"require": {
"jkphl/dom-factory": "^1",
"php": ">=5.5"
},
"require-dev": {
"clue/graph-composer": "dev-master",
"codeclimate/php-test-reporter": "^0.4.4",
"phpunit/phpunit": "^4.8",
"satooshi/php-coveralls": "^1.0",
"squizlabs/php_codesniffer": "^2.8"
},
"type": "library",
"autoload": {
"psr-4": {
"Jkphl\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Joschi Kuphal",
"email": "joschi@tollwerk.de",
"homepage": "https://jkphl.is",
"role": "Developer"
}
],
"description": "RDFa Lite 1.1 and HTML Microdata parser for web documents (HTML, SVG, XML)",
"homepage": "https://github.com/jkphl/rdfa-lite-microdata",
"support": {
"email": "joschi@tollwerk.de",
"issues": "https://github.com/jkphl/rdfa-lite-microdata/issues",
"source": "https://github.com/jkphl/rdfa-lite-microdata"
},
"time": "2023-01-27T13:29:45+00:00"
},
{
"name": "kelunik/certificate",
"version": "v1.1.3",
@ -6899,6 +7085,170 @@
},
"time": "2025-07-25T09:04:22+00:00"
},
{
"name": "mf2/mf2",
"version": "0.4.6",
"source": {
"type": "git",
"url": "https://github.com/microformats/php-mf2.git",
"reference": "00b70ee7eb7f5b0585b1bd467f6c9cbd75055d23"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/microformats/php-mf2/zipball/00b70ee7eb7f5b0585b1bd467f6c9cbd75055d23",
"reference": "00b70ee7eb7f5b0585b1bd467f6c9cbd75055d23",
"shasum": ""
},
"require": {
"php": ">=5.4.0"
},
"require-dev": {
"mf2/tests": "@dev",
"phpdocumentor/phpdocumentor": "v2.8.4",
"phpunit/phpunit": "4.8.*"
},
"suggest": {
"barnabywalters/mf-cleaner": "To more easily handle the canonical data php-mf2 gives you",
"masterminds/html5": "Alternative HTML parser for PHP, for better HTML5 support."
},
"bin": [
"bin/fetch-mf2",
"bin/parse-mf2"
],
"type": "library",
"autoload": {
"files": [
"Mf2/Parser.php"
]
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"CC0-1.0"
],
"authors": [
{
"name": "Barnaby Walters",
"homepage": "http://waterpigs.co.uk"
}
],
"description": "A pure, generic microformats2 parser — makes HTML as easy to consume as a JSON API",
"keywords": [
"html",
"microformats",
"microformats 2",
"parser",
"semantic"
],
"support": {
"issues": "https://github.com/microformats/php-mf2/issues",
"source": "https://github.com/microformats/php-mf2/tree/master"
},
"time": "2018-08-24T14:47:04+00:00"
},
{
"name": "ml/iri",
"version": "1.1.4",
"target-dir": "ML/IRI",
"source": {
"type": "git",
"url": "https://github.com/lanthaler/IRI.git",
"reference": "cbd44fa913e00ea624241b38cefaa99da8d71341"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/lanthaler/IRI/zipball/cbd44fa913e00ea624241b38cefaa99da8d71341",
"reference": "cbd44fa913e00ea624241b38cefaa99da8d71341",
"shasum": ""
},
"require": {
"lib-pcre": ">=4.0",
"php": ">=5.3.0"
},
"type": "library",
"autoload": {
"psr-0": {
"ML\\IRI": ""
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Markus Lanthaler",
"email": "mail@markus-lanthaler.com",
"homepage": "http://www.markus-lanthaler.com",
"role": "Developer"
}
],
"description": "IRI handling for PHP",
"homepage": "http://www.markus-lanthaler.com",
"keywords": [
"URN",
"iri",
"uri",
"url"
],
"support": {
"issues": "https://github.com/lanthaler/IRI/issues",
"source": "https://github.com/lanthaler/IRI/tree/master"
},
"time": "2014-01-21T13:43:39+00:00"
},
{
"name": "ml/json-ld",
"version": "1.2.1",
"source": {
"type": "git",
"url": "https://github.com/lanthaler/JsonLD.git",
"reference": "537e68e87a6bce23e57c575cd5dcac1f67ce25d8"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/lanthaler/JsonLD/zipball/537e68e87a6bce23e57c575cd5dcac1f67ce25d8",
"reference": "537e68e87a6bce23e57c575cd5dcac1f67ce25d8",
"shasum": ""
},
"require": {
"ext-json": "*",
"ml/iri": "^1.1.1",
"php": ">=5.3.0"
},
"require-dev": {
"json-ld/tests": "1.0",
"phpunit/phpunit": "^4"
},
"type": "library",
"autoload": {
"psr-4": {
"ML\\JsonLD\\": ""
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Markus Lanthaler",
"email": "mail@markus-lanthaler.com",
"homepage": "http://www.markus-lanthaler.com",
"role": "Developer"
}
],
"description": "JSON-LD Processor for PHP",
"homepage": "http://www.markus-lanthaler.com",
"keywords": [
"JSON-LD",
"jsonld"
],
"support": {
"issues": "https://github.com/lanthaler/JsonLD/issues",
"source": "https://github.com/lanthaler/JsonLD/tree/1.2.1"
},
"time": "2022-09-29T08:45:17+00:00"
},
{
"name": "monolog/monolog",
"version": "3.10.0",
@ -9409,16 +9759,16 @@
},
{
"name": "rhukster/dom-sanitizer",
"version": "1.0.10",
"version": "1.0.11",
"source": {
"type": "git",
"url": "https://github.com/rhukster/dom-sanitizer.git",
"reference": "49a98046b708a4c92f754f5b0ef1720bb85142e2"
"reference": "02d08ec8b36b93b04517d74fe82b715ef06273bd"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/rhukster/dom-sanitizer/zipball/49a98046b708a4c92f754f5b0ef1720bb85142e2",
"reference": "49a98046b708a4c92f754f5b0ef1720bb85142e2",
"url": "https://api.github.com/repos/rhukster/dom-sanitizer/zipball/02d08ec8b36b93b04517d74fe82b715ef06273bd",
"reference": "02d08ec8b36b93b04517d74fe82b715ef06273bd",
"shasum": ""
},
"require": {
@ -9448,9 +9798,9 @@
"description": "A simple but effective DOM/SVG/MathML Sanitizer for PHP 7.4+",
"support": {
"issues": "https://github.com/rhukster/dom-sanitizer/issues",
"source": "https://github.com/rhukster/dom-sanitizer/tree/1.0.10"
"source": "https://github.com/rhukster/dom-sanitizer/tree/1.0.11"
},
"time": "2026-04-10T17:00:11+00:00"
"time": "2026-04-23T22:56:32+00:00"
},
{
"name": "robrichards/xmlseclibs",
@ -13693,7 +14043,7 @@
},
{
"name": "symfony/polyfill-ctype",
"version": "v1.36.0",
"version": "v1.37.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/polyfill-ctype.git",
@ -13752,7 +14102,7 @@
"portable"
],
"support": {
"source": "https://github.com/symfony/polyfill-ctype/tree/v1.36.0"
"source": "https://github.com/symfony/polyfill-ctype/tree/v1.37.0"
},
"funding": [
{
@ -13776,16 +14126,16 @@
},
{
"name": "symfony/polyfill-intl-grapheme",
"version": "v1.36.0",
"version": "v1.37.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/polyfill-intl-grapheme.git",
"reference": "ad1b7b9092976d6c948b8a187cec9faaea9ec1df"
"reference": "4864388bfbd3001ce88e234fab652acd91fdc57e"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/symfony/polyfill-intl-grapheme/zipball/ad1b7b9092976d6c948b8a187cec9faaea9ec1df",
"reference": "ad1b7b9092976d6c948b8a187cec9faaea9ec1df",
"url": "https://api.github.com/repos/symfony/polyfill-intl-grapheme/zipball/4864388bfbd3001ce88e234fab652acd91fdc57e",
"reference": "4864388bfbd3001ce88e234fab652acd91fdc57e",
"shasum": ""
},
"require": {
@ -13834,7 +14184,7 @@
"shim"
],
"support": {
"source": "https://github.com/symfony/polyfill-intl-grapheme/tree/v1.36.0"
"source": "https://github.com/symfony/polyfill-intl-grapheme/tree/v1.37.0"
},
"funding": [
{
@ -13854,11 +14204,11 @@
"type": "tidelift"
}
],
"time": "2026-04-10T16:19:22+00:00"
"time": "2026-04-26T13:13:48+00:00"
},
{
"name": "symfony/polyfill-intl-icu",
"version": "v1.36.0",
"version": "v1.37.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/polyfill-intl-icu.git",
@ -13922,7 +14272,7 @@
"shim"
],
"support": {
"source": "https://github.com/symfony/polyfill-intl-icu/tree/v1.36.0"
"source": "https://github.com/symfony/polyfill-intl-icu/tree/v1.37.0"
},
"funding": [
{
@ -13946,7 +14296,7 @@
},
{
"name": "symfony/polyfill-intl-idn",
"version": "v1.36.0",
"version": "v1.37.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/polyfill-intl-idn.git",
@ -14009,7 +14359,7 @@
"shim"
],
"support": {
"source": "https://github.com/symfony/polyfill-intl-idn/tree/v1.36.0"
"source": "https://github.com/symfony/polyfill-intl-idn/tree/v1.37.0"
},
"funding": [
{
@ -14033,7 +14383,7 @@
},
{
"name": "symfony/polyfill-intl-normalizer",
"version": "v1.36.0",
"version": "v1.37.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/polyfill-intl-normalizer.git",
@ -14094,7 +14444,7 @@
"shim"
],
"support": {
"source": "https://github.com/symfony/polyfill-intl-normalizer/tree/v1.36.0"
"source": "https://github.com/symfony/polyfill-intl-normalizer/tree/v1.37.0"
},
"funding": [
{
@ -14118,7 +14468,7 @@
},
{
"name": "symfony/polyfill-php83",
"version": "v1.36.0",
"version": "v1.37.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/polyfill-php83.git",
@ -14174,7 +14524,7 @@
"shim"
],
"support": {
"source": "https://github.com/symfony/polyfill-php83/tree/v1.36.0"
"source": "https://github.com/symfony/polyfill-php83/tree/v1.37.0"
},
"funding": [
{
@ -14198,7 +14548,7 @@
},
{
"name": "symfony/polyfill-php84",
"version": "v1.36.0",
"version": "v1.37.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/polyfill-php84.git",
@ -14254,7 +14604,7 @@
"shim"
],
"support": {
"source": "https://github.com/symfony/polyfill-php84/tree/v1.36.0"
"source": "https://github.com/symfony/polyfill-php84/tree/v1.37.0"
},
"funding": [
{
@ -14278,16 +14628,16 @@
},
{
"name": "symfony/polyfill-php85",
"version": "v1.36.0",
"version": "v1.37.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/polyfill-php85.git",
"reference": "2c408a6bb0313e6001a83628dc5506100474254e"
"reference": "fcfa4973a9917cef23f2e38774da74a2b7d115ee"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/symfony/polyfill-php85/zipball/2c408a6bb0313e6001a83628dc5506100474254e",
"reference": "2c408a6bb0313e6001a83628dc5506100474254e",
"url": "https://api.github.com/repos/symfony/polyfill-php85/zipball/fcfa4973a9917cef23f2e38774da74a2b7d115ee",
"reference": "fcfa4973a9917cef23f2e38774da74a2b7d115ee",
"shasum": ""
},
"require": {
@ -14334,7 +14684,7 @@
"shim"
],
"support": {
"source": "https://github.com/symfony/polyfill-php85/tree/v1.36.0"
"source": "https://github.com/symfony/polyfill-php85/tree/v1.37.0"
},
"funding": [
{
@ -14354,11 +14704,11 @@
"type": "tidelift"
}
],
"time": "2026-04-10T16:50:15+00:00"
"time": "2026-04-26T13:10:57+00:00"
},
{
"name": "symfony/polyfill-uuid",
"version": "v1.36.0",
"version": "v1.37.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/polyfill-uuid.git",
@ -14417,7 +14767,7 @@
"uuid"
],
"support": {
"source": "https://github.com/symfony/polyfill-uuid/tree/v1.36.0"
"source": "https://github.com/symfony/polyfill-uuid/tree/v1.37.0"
},
"funding": [
{
@ -19854,12 +20204,12 @@
"source": {
"type": "git",
"url": "https://github.com/Roave/SecurityAdvisories.git",
"reference": "10b8a93511210c9bae3be31f4fe13c3ff974cad4"
"reference": "08cd07f04fb07fb4d316e956801d57b700cf7096"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/Roave/SecurityAdvisories/zipball/10b8a93511210c9bae3be31f4fe13c3ff974cad4",
"reference": "10b8a93511210c9bae3be31f4fe13c3ff974cad4",
"url": "https://api.github.com/repos/Roave/SecurityAdvisories/zipball/08cd07f04fb07fb4d316e956801d57b700cf7096",
"reference": "08cd07f04fb07fb4d316e956801d57b700cf7096",
"shasum": ""
},
"conflict": {
@ -19882,6 +20232,7 @@
"alextselegidis/easyappointments": "<=1.5.2",
"alexusmai/laravel-file-manager": "<=3.3.1",
"algolia/algoliasearch-magento-2": "<=3.16.1|>=3.17.0.0-beta1,<=3.17.1",
"almirhodzic/nova-toggle-5": "<1.3",
"alt-design/alt-redirect": "<1.6.4",
"altcha-org/altcha": "<1.3.1",
"alterphp/easyadmin-extension-bundle": ">=1.2,<1.2.11|>=1.3,<1.3.1",
@ -19978,7 +20329,7 @@
"ckeditor/ckeditor": "<4.25",
"clickstorm/cs-seo": ">=6,<6.8|>=7,<7.5|>=8,<8.4|>=9,<9.3",
"co-stack/fal_sftp": "<0.2.6",
"cockpit-hq/cockpit": "<2.13.5",
"cockpit-hq/cockpit": "<2.14",
"code16/sharp": "<9.20",
"codeception/codeception": "<3.1.3|>=4,<4.1.22",
"codeigniter/framework": "<3.1.10",
@ -20141,7 +20492,7 @@
"fisharebest/webtrees": "<=2.1.18",
"fixpunkt/fp-masterquiz": "<2.2.1|>=3,<3.5.2",
"fixpunkt/fp-newsletter": "<1.1.1|>=1.2,<2.1.2|>=2.2,<3.2.6",
"flarum/core": "<1.8.10",
"flarum/core": "<=1.8.15|>=2.0.0.0-beta1,<=2.0.0.0-beta8",
"flarum/flarum": "<0.1.0.0-beta8",
"flarum/framework": "<1.8.10",
"flarum/mentions": "<1.6.3",
@ -20178,7 +20529,7 @@
"geshi/geshi": "<=1.0.9.1",
"getformwork/formwork": "<=2.3.3",
"getgrav/grav": "<1.11.0.0-beta1",
"getkirby/cms": "<=5.2.1",
"getkirby/cms": "<5.4",
"getkirby/kirby": "<3.9.8.3-dev|>=3.10,<3.10.1.2-dev|>=4,<4.7.1",
"getkirby/panel": "<2.5.14",
"getkirby/starterkit": "<=3.7.0.2",
@ -20276,7 +20627,7 @@
"kelvinmo/simplexrd": "<3.1.1",
"kevinpapst/kimai2": "<1.16.7",
"khodakhah/nodcms": "<=3.4.1",
"kimai/kimai": "<=2.53",
"kimai/kimai": "<2.54",
"kitodo/presentation": "<3.2.3|>=3.3,<3.3.4",
"klaviyo/magento2-extension": ">=1,<3",
"knplabs/knp-snappy": "<=1.4.2",
@ -20720,7 +21071,7 @@
"twig/twig": "<3.11.2|>=3.12,<3.14.1|>=3.16,<3.19",
"typicms/core": "<16.1.7",
"typo3/cms": "<9.5.29|>=10,<10.4.35|>=11,<11.5.23|>=12,<12.2",
"typo3/cms-backend": "<4.1.14|>=4.2,<4.2.15|>=4.3,<4.3.7|>=4.4,<4.4.4|>=7,<=7.6.50|>=8,<=8.7.39|>=9,<9.5.55|>=10,<=10.4.54|>=11,<=11.5.48|>=12,<=12.4.40|>=13,<=13.4.22|>=14,<=14.0.1",
"typo3/cms-backend": "<4.1.14|>=4.2,<4.2.15|>=4.3,<4.3.7|>=4.4,<4.4.4|>=7,<=7.6.50|>=8,<=8.7.39|>=9,<9.5.55|>=10,<=10.4.54|>=11,<=11.5.48|>=12,<=12.4.40|>=13,<=13.4.22|>=14,<=14.0.1|==14.2",
"typo3/cms-belog": ">=10,<=10.4.47|>=11,<=11.5.41|>=12,<=12.4.24|>=13,<=13.4.2",
"typo3/cms-beuser": ">=9,<9.5.55|>=10,<10.4.54|>=11,<11.5.48|>=12,<12.4.37|>=13,<13.4.18",
"typo3/cms-core": "<=8.7.56|>=9,<9.5.55|>=10,<=10.4.54|>=11,<=11.5.48|>=12,<=12.4.40|>=13,<=13.4.22|>=14,<=14.0.1",
@ -20902,7 +21253,7 @@
"type": "tidelift"
}
],
"time": "2026-04-22T18:27:19+00:00"
"time": "2026-04-24T17:22:29+00:00"
},
{
"name": "sebastian/cli-parser",
@ -22418,6 +22769,7 @@
"aliases": [],
"minimum-stability": "stable",
"stability-flags": {
"jkphl/micrometa": 20,
"roave/security-advisories": 20
},
"prefer-stable": false,

View file

@ -54,7 +54,8 @@ final class DTOJsonSchemaConverter
'category' => ['type' => ['string', 'null'], 'description' => 'Product category'],
'manufacturing_status' => ['type' => ['string', 'null'], 'enum' => ['active', 'obsolete', 'nrfnd', 'discontinued', null], 'description' => 'Manufacturing status'],
'footprint' => ['type' => ['string', 'null'], 'description' => 'Package/footprint type'],
'mass' => ['type' => ['number', 'null'], 'description' => 'Mass in grams'],
'mass' => ['type' => ['number', 'null'], 'description' => 'Mass of the product in grams'],
'gtin' => ['type' => ['string', 'null'], 'description' => 'Global Trade Item Number (GTIN) / EAN / UPC code'],
'parameters' => [
'type' => 'array',
'items' => [
@ -94,17 +95,17 @@ final class DTOJsonSchemaConverter
'items' => [
'type' => 'object',
'properties' => [
'distributor_name' => ['type' => 'string'],
'order_number' => ['type' => ['string', 'null']],
'distributor_name' => ['type' => 'string', 'description' => 'Name of the distributor or vendor. Typically the shop name'],
'order_number' => ['type' => ['string', 'null'], 'description' => 'The order number or SKU used by the distributor. Optional, but can help to find the product on the distributor website.'],
'product_url' => ['type' => 'string'],
'prices' => [
'type' => 'array',
'items' => [
'type' => 'object',
'properties' => [
'minimum_quantity' => ['type' => 'integer'],
'price' => ['type' => 'number'],
'currency' => ['type' => 'string'],
'minimum_quantity' => ['type' => 'integer', 'description' => 'Minimum quantity for this price tier. 1 when no tiered pricing is available.'],
'price' => ['type' => 'number', 'description' => 'Price for the given minimum quantity.'],
'currency' => ['type' => 'string', 'description' => 'Currency ISO code, e.g. USD'],
],
'required' => ['minimum_quantity', 'price', 'currency'],
],
@ -226,6 +227,7 @@ final class DTOJsonSchemaConverter
manufacturing_status: $manufacturingStatus,
provider_url: $productUrl,
footprint: $data['footprint'] ?? null,
gtin: $data['gtin'] ?? null,
notes: null,
datasheets: $datasheets,
images: $images,

View file

@ -29,10 +29,15 @@ use App\Services\AI\AIPlatformRegistry;
use App\Services\InfoProviderSystem\DTOJsonSchemaConverter;
use App\Services\InfoProviderSystem\DTOs\PartDetailDTO;
use App\Settings\InfoProviderSystem\AIExtractorSettings;
use Brick\Schema\SchemaReader;
use Jkphl\Micrometa;
use League\HTMLToMarkdown\HtmlConverter;
use Symfony\AI\Platform\Message\Message;
use Symfony\AI\Platform\Message\MessageBag;
use Symfony\Component\DomCrawler\Crawler;
use Symfony\Contracts\HttpClient\HttpClientInterface;
final class AIInfoExtractor implements InfoProviderInterface
{
use FixAndValidateUrlTrait;
@ -95,16 +100,56 @@ final class AIInfoExtractor implements InfoProviderInterface
$html = $response->getContent();
// Clean HTML
$cleanedHtml = $this->cleanHTML($html);
/*$cleanedHtml = $this->cleanHTML($html);
// Truncate to max content length
$truncatedHtml = $this->truncateHTML($cleanedHtml, $this->settings->maxContentLength);
$truncatedHtml = $this->truncateHTML($cleanedHtml, $this->settings->maxContentLength);*/
$markdown = $this->htmlToMarkdown($html);
//Extract structured data using traditional methods, to provide additional context to the LLM. This can help improve accuracy, especially for technical specifications that might be in tables or specific formats.
$structuredData = $this->extractStructuredData($html, $url);
// Call LLM
$llmResponse = $this->callLLM($truncatedHtml, $url);
$llmResponse = $this->callLLM($markdown, $url, $structuredData);
// Build and return PartDetailDTO
return $this->jsonSchemaConverter->jsonToDTO($llmResponse, $this->getProviderKey(), $url, $url, self::DISTRIBUTOR_NAME);
$result = $this->jsonSchemaConverter->jsonToDTO($llmResponse, $this->getProviderKey(), $url, $url, self::DISTRIBUTOR_NAME);
return $result;
}
/**
 * Runs the micrometa parser over a page and returns the result as a JSON string.
 *
 * The parser is invoked with the page URL and the raw HTML; whatever item list
 * it produces is converted to a plain object and JSON encoded.
 * NOTE(review): which embedded formats (microdata, JSON-LD, RDFa, microformats)
 * are parsed depends on the parser's defaults — confirm against the micrometa docs.
 *
 * @param string $html Raw HTML of the page
 * @param string $url  URL of the page, handed to the parser alongside the HTML
 * @return string JSON encoded structured data
 * @throws \JsonException If the parsed data cannot be encoded as JSON
 */
private function extractStructuredData(string $html, string $url): string
{
    $parser = new Micrometa\Ports\Parser();
    $parsedItems = $parser($url, $html);
    $payload = $parsedItems->toObject();

    return json_encode($payload, JSON_THROW_ON_ERROR);
}
/**
 * Converts a page's HTML to Markdown, preferring the main content area if one exists.
 *
 * The first element matching `main`, `article` or `#content` is converted when
 * present; otherwise the complete HTML string is converted unchanged.
 *
 * @param string $html Raw HTML of the page
 * @return string Markdown produced by the HTML converter
 */
private function htmlToMarkdown(string $html): string
{
    // Narrow the document down to its main content, so the LLM is not flooded with irrelevant markup.
    $contentNode = (new Crawler($html))->filter('main, article, #content')->first();

    // Use the matched content area when there is one; fall back to the full HTML otherwise.
    if ($contentNode->count() > 0) {
        $source = $contentNode->html();
    } else {
        $source = $html;
    }

    // Convert to markdown
    $options = [
        'strip_tags' => true, // Drop tags without a Markdown counterpart (like <div>)
        'hard_break' => true, // Keep line breaks
        'remove_nodes' => 'nav footer script style' // Extra safety layer
    ];

    return (new HtmlConverter($options))->convert($source);
}
public function getCapabilities(): array
@ -160,13 +205,18 @@ final class AIInfoExtractor implements InfoProviderInterface
return $truncated;
}
private function callLLM(string $htmlContent, string $url): array
private function callLLM(string $htmlContent, string $url, ?string $structuredData = null): array
{
$input = new MessageBag(
Message::forSystem($this->buildSystemPrompt()),
Message::ofUser("Extract part information from this webpage content:\n\nURL: $url\n\n$htmlContent")
);
if ($structuredData) {
$input->add(Message::ofUser("Following data was extracted using traditional methods, but might be incomplete or inaccurate.
Enrich it with the actual website data:\n\n".$structuredData));
}
try {
$aiPlatform = $this->AIPlatformRegistry->getPlatform($this->settings->platform ?? throw new \RuntimeException('No AI platform selected') );
@ -187,29 +237,8 @@ final class AIInfoExtractor implements InfoProviderInterface
private function buildSystemPrompt(): string
{
return <<<'PROMPT'
You are an expert at extracting electronic component information from web pages. Extract structured data in JSON format.
Return ONLY a valid JSON object with this exact structure:
{
"name": "string",
"description": "string",
"manufacturer": "string | null",
"mpn": "string | null",
"category": "string | null",
"manufacturing_status": "active|obsolete|nrfnd|discontinued|null",
"footprint": "string | null",
"mass": "number | null (in grams)",
"parameters": [{"name": "string", "value": "string", "unit": "string | null"}],
"datasheets": [{"url": "string", "description": "string"}],
"images": [{"url": "string", "description": "string"}],
"vendor_infos": [{
"distributor_name": "string",
"order_number": "string | null",
"product_url": "string",
"prices": [{"minimum_quantity": int, "price": number, "currency": "string"}]
}],
"manufacturer_product_url": "string | null"
}
You are an expert at extracting electronic component information from web pages. Extract structured data in JSON format, from markdown extracted from a product page.
Focus on the main content of the page, such as product descriptions, specifications, and tables. Ignore navigation menus, footers, and sidebars.
Rules:
- manufacturing_status: Use "active", "obsolete", "nrfnd" (not recommended for new designs), "discontinued", or null

View file

@ -441,12 +441,6 @@
"symfony/browser-kit": {
"version": "v4.2.3"
},
"symfony/cache": {
"version": "v4.2.3"
},
"symfony/cache-contracts": {
"version": "v1.1.5"
},
"symfony/config": {
"version": "v4.2.3"
},