Merge branch 'generic_webshop'
Some checks failed
Build assets artifact / Build assets artifact (push) Has been cancelled
Docker Image Build / docker (push) Has been cancelled
Docker Image Build (FrankenPHP) / docker (push) Has been cancelled
Static analysis / Static analysis (push) Has been cancelled
PHPUnit Tests / PHPUnit and coverage Test (PHP 8.2, mysql) (push) Has been cancelled
PHPUnit Tests / PHPUnit and coverage Test (PHP 8.3, mysql) (push) Has been cancelled
PHPUnit Tests / PHPUnit and coverage Test (PHP 8.4, mysql) (push) Has been cancelled
PHPUnit Tests / PHPUnit and coverage Test (PHP 8.5, mysql) (push) Has been cancelled
PHPUnit Tests / PHPUnit and coverage Test (PHP 8.2, postgres) (push) Has been cancelled
PHPUnit Tests / PHPUnit and coverage Test (PHP 8.3, postgres) (push) Has been cancelled
PHPUnit Tests / PHPUnit and coverage Test (PHP 8.4, postgres) (push) Has been cancelled
PHPUnit Tests / PHPUnit and coverage Test (PHP 8.5, postgres) (push) Has been cancelled
PHPUnit Tests / PHPUnit and coverage Test (PHP 8.2, sqlite) (push) Has been cancelled
PHPUnit Tests / PHPUnit and coverage Test (PHP 8.3, sqlite) (push) Has been cancelled
PHPUnit Tests / PHPUnit and coverage Test (PHP 8.4, sqlite) (push) Has been cancelled
PHPUnit Tests / PHPUnit and coverage Test (PHP 8.5, sqlite) (push) Has been cancelled

This commit is contained in:
Jan Böhmer 2026-02-03 23:20:17 +01:00
commit 851055bdb4
3 changed files with 290 additions and 83 deletions

View file

@ -18,6 +18,7 @@
"api-platform/symfony": "^4.0.0",
"beberlei/doctrineextensions": "^1.2",
"brick/math": "^0.13.1",
"brick/schema": "^0.2.0",
"composer/ca-bundle": "^1.5",
"composer/package-versions-deprecated": "^1.11.99.5",
"doctrine/data-fixtures": "^2.0.0",

173
composer.lock generated
View file

@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically"
],
"content-hash": "8e387d6d016f33eb7302c47ecb7a12b9",
"content-hash": "7ca9c95fb85f6bf3d9b8a3aa98ca33f6",
"packages": [
{
"name": "amphp/amp",
@ -2387,6 +2387,117 @@
],
"time": "2025-03-29T13:50:30+00:00"
},
{
"name": "brick/schema",
"version": "0.2.0",
"source": {
"type": "git",
"url": "https://github.com/brick/schema.git",
"reference": "b5114bf5e8092430041a37efe1cfd5279ca764c0"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/brick/schema/zipball/b5114bf5e8092430041a37efe1cfd5279ca764c0",
"reference": "b5114bf5e8092430041a37efe1cfd5279ca764c0",
"shasum": ""
},
"require": {
"brick/structured-data": "~0.1.0 || ~0.2.0",
"ext-dom": "*",
"php": "^8.1"
},
"require-dev": {
"brick/varexporter": "^0.6",
"vimeo/psalm": "6.12.0"
},
"type": "library",
"autoload": {
"psr-4": {
"Brick\\Schema\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"description": "Schema.org library for PHP",
"keywords": [
"JSON-LD",
"brick",
"microdata",
"rdfa lite",
"schema",
"schema.org",
"structured data"
],
"support": {
"issues": "https://github.com/brick/schema/issues",
"source": "https://github.com/brick/schema/tree/0.2.0"
},
"funding": [
{
"url": "https://github.com/BenMorel",
"type": "github"
}
],
"time": "2025-06-12T07:03:20+00:00"
},
{
"name": "brick/structured-data",
"version": "0.2.0",
"source": {
"type": "git",
"url": "https://github.com/brick/structured-data.git",
"reference": "be9b28720e2aba87f19c90500700970be85affde"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/brick/structured-data/zipball/be9b28720e2aba87f19c90500700970be85affde",
"reference": "be9b28720e2aba87f19c90500700970be85affde",
"shasum": ""
},
"require": {
"ext-dom": "*",
"ext-json": "*",
"ext-libxml": "*",
"php": "^8.1",
"sabre/uri": "^2.1 || ^3.0"
},
"require-dev": {
"php-coveralls/php-coveralls": "^2.0",
"phpunit/phpunit": "^8.0 || ^9.0",
"vimeo/psalm": "6.12.0"
},
"type": "library",
"autoload": {
"psr-4": {
"Brick\\StructuredData\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"description": "Microdata, RDFa Lite & JSON-LD structured data reader",
"keywords": [
"JSON-LD",
"brick",
"microdata",
"rdfa",
"structured data"
],
"support": {
"issues": "https://github.com/brick/structured-data/issues",
"source": "https://github.com/brick/structured-data/tree/0.2.0"
},
"funding": [
{
"url": "https://github.com/BenMorel",
"type": "github"
}
],
"time": "2025-06-10T23:48:46+00:00"
},
{
"name": "composer/ca-bundle",
"version": "1.5.10",
@ -9595,6 +9706,66 @@
},
"time": "2025-09-14T07:37:21+00:00"
},
{
"name": "sabre/uri",
"version": "3.0.2",
"source": {
"type": "git",
"url": "https://github.com/sabre-io/uri.git",
"reference": "38eeab6ed9eec435a2188db489d4649c56272c51"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/sabre-io/uri/zipball/38eeab6ed9eec435a2188db489d4649c56272c51",
"reference": "38eeab6ed9eec435a2188db489d4649c56272c51",
"shasum": ""
},
"require": {
"php": "^7.4 || ^8.0"
},
"require-dev": {
"friendsofphp/php-cs-fixer": "^3.64",
"phpstan/extension-installer": "^1.4",
"phpstan/phpstan": "^1.12",
"phpstan/phpstan-phpunit": "^1.4",
"phpstan/phpstan-strict-rules": "^1.6",
"phpunit/phpunit": "^9.6"
},
"type": "library",
"autoload": {
"files": [
"lib/functions.php"
],
"psr-4": {
"Sabre\\Uri\\": "lib/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"BSD-3-Clause"
],
"authors": [
{
"name": "Evert Pot",
"email": "me@evertpot.com",
"homepage": "http://evertpot.com/",
"role": "Developer"
}
],
"description": "Functions for making sense out of URIs.",
"homepage": "http://sabre.io/uri/",
"keywords": [
"rfc3986",
"uri",
"url"
],
"support": {
"forum": "https://groups.google.com/group/sabredav-discuss",
"issues": "https://github.com/sabre-io/uri/issues",
"source": "https://github.com/fruux/sabre-uri"
},
"time": "2024-09-04T15:30:08+00:00"
},
{
"name": "scheb/2fa-backup-code",
"version": "v7.13.1",

View file

@ -32,6 +32,14 @@ use App\Services\InfoProviderSystem\DTOs\SearchResultDTO;
use App\Services\InfoProviderSystem\PartInfoRetriever;
use App\Services\InfoProviderSystem\ProviderRegistry;
use App\Settings\InfoProviderSystem\GenericWebProviderSettings;
use Brick\Schema\Interfaces\BreadcrumbList;
use Brick\Schema\Interfaces\ImageObject;
use Brick\Schema\Interfaces\Product;
use Brick\Schema\Interfaces\PropertyValue;
use Brick\Schema\Interfaces\QuantitativeValue;
use Brick\Schema\Interfaces\Thing;
use Brick\Schema\SchemaReader;
use Brick\Schema\SchemaTypeList;
use Symfony\Component\DomCrawler\Crawler;
use Symfony\Contracts\HttpClient\HttpClientInterface;
@ -104,126 +112,148 @@ class GenericWebProvider implements InfoProviderInterface
return $host;
}
/**
 * Builds a human-readable category path from a schema.org BreadcrumbList.
 *
 * The names of the breadcrumb entries are trimmed and joined with " -> ",
 * e.g. "Home -> Tools -> Drills". Entries without a usable name are skipped.
 *
 * @param BreadcrumbList|null $breadcrumbList The breadcrumb list found on the page, if any
 * @return string|null The category path, or null if no list was given or none of its entries has a name
 */
private function breadcrumbToCategory(?BreadcrumbList $breadcrumbList): ?string
{
    if ($breadcrumbList === null) {
        return null;
    }

    $categories = [];
    foreach ($breadcrumbList->itemListElement->getValues() as $item) {
        //List entries can also be plain strings; only structured items can carry a name
        if (!$item instanceof Thing) {
            continue;
        }

        //toString() may yield null, which must not be passed to trim()
        $name = trim((string) $item->name?->toString());
        if ($name !== '') {
            $categories[] = $name;
        }
    }

    //Return null instead of an empty string, so that callers can fall back to another source via "??"
    if ($categories === []) {
        return null;
    }

    return implode(' -> ', $categories);
}
/**
 * Converts a schema.org Product found on a web page into a PartDetailDTO.
 *
 * @param Product $product The product schema extracted from the page
 * @param string $url The URL of the page; used as provider ID, provider URL and as fallback product URL
 * @param Crawler $dom The parsed page, used to read the description from its meta tags
 * @param BreadcrumbList|null $categoryBreadcrumb Breadcrumb list of the page; preferred over the product's own category
 * @return PartDetailDTO
 */
private function productToPart(Product $product, string $url, Crawler $dom, ?BreadcrumbList $categoryBreadcrumb): PartDetailDTO
{
    //Build the notes from the description and the (optional) disambiguating description
    $notes = $product->description?->toString() ?? '';
    $disambiguatingDescription = $product->disambiguatingDescription?->toString() ?? '';
    if ($disambiguatingDescription !== '') {
        //Only add the separator if there is actually something to separate
        if ($notes !== '') {
            $notes .= "\n\n";
        }
        $notes .= $disambiguatingDescription;
    }

    //Extract vendor infos
    $vendor_infos = null;
    $offer = $product->offers->getFirstValue();
    //An offer can also be given as a plain string, which carries no price information
    if ($offer instanceof Thing) {
        //Use the price of the offer itself, or the one of its first nested offer (like IKEA does it)
        $pricedOffer = $offer;
        if ($offer->price->toString() === null) {
            $nestedOffer = $offer->offers->getFirstValue();
            $pricedOffer = $nestedOffer instanceof Thing ? $nestedOffer : null;
        }

        $prices = [];
        $price = $pricedOffer?->price->toString();
        if ($price !== null) {
            $prices[] = new PriceDTO(
                minimum_discount_amount: 1,
                price: $price,
                currency_iso_code: $pricedOffer->priceCurrency?->toString()
            );
        }

        $vendor_infos = [new PurchaseInfoDTO(
            distributor_name: $this->extractShopName($url),
            order_number: $product->sku?->toString() ?? $product->identifier?->toString() ?? 'Unknown',
            prices: $prices,
            product_url: $offer->url?->toString() ?? $url,
        )];
    }

    //Extract image: it is either given directly as URL string or as ImageObject
    $image = null;
    $imageObj = $product->image?->getFirstValue();
    if (is_string($imageObj)) {
        $image = $imageObj;
    } elseif ($imageObj instanceof ImageObject) {
        $image = $imageObj->contentUrl?->toString() ?? $imageObj->url?->toString();
    }

    //Extract parameters from additionalProperty
    $parameters = [];
    foreach ($product->additionalProperty->getValues() as $property) {
        if (!$property instanceof PropertyValue) {
            continue;
        }

        //TODO: Handle minValue and maxValue
        $parameterName = $property->name->toString() ?? 'Unknown';
        $parameterValue = $property->value->toString() ?? '';
        $unit = $property->unitText->toString();

        //Without an explicit unit, the unit has to be parsed out of the value itself
        $parameters[] = $unit !== null
            ? ParameterDTO::parseValueField(name: $parameterName, value: $parameterValue, unit: $unit)
            : ParameterDTO::parseValueIncludingUnit(name: $parameterName, value: $parameterValue);
    }

    //Try to extract weight
    //NOTE(review): mass is passed as float here, like the array based predecessor did - confirm against PartDetailDTO
    $mass = null;
    $weight = $product->weight->getFirstValue();
    if ($weight instanceof QuantitativeValue) {
        $weightValue = $weight->value->toString();
        //Ignore weights which are not plain numbers
        if ($weightValue !== null && is_numeric($weightValue)) {
            $mass = (float) $weightValue;
        }
    }

    return new PartDetailDTO(
        provider_key: $this->getProviderKey(),
        provider_id: $url,
        name: $product->name?->toString() ?? $product->alternateName?->toString() ?? $product->mpn?->toString() ?? 'Unknown Name',
        description: $this->getMetaContent($dom, 'og:description') ?? $this->getMetaContent($dom, 'description') ?? '',
        category: $this->breadcrumbToCategory($categoryBreadcrumb) ?? $product->category?->toString(),
        manufacturer: self::propertyOrString($product->manufacturer) ?? self::propertyOrString($product->brand),
        mpn: $product->mpn?->toString(),
        preview_image_url: $image,
        provider_url: $url,
        notes: $notes,
        parameters: $parameters,
        vendor_infos: $vendor_infos,
        mass: $mass
    );
}
/**
 * Resolves a schema value, which can be given either as plain text or as a nested object, to a string.
 *
 * A list is reduced to its first value. A string is returned as it is; for a schema object the given
 * property (by default "name") is returned instead.
 *
 * @param SchemaTypeList|Thing|string|null $value The value to resolve
 * @param string $property The property to read, if the value is a schema object
 * @return string|null The resolved text, or null if the value is missing or of an unsupported kind
 */
private static function propertyOrString(SchemaTypeList|Thing|string|null $value, string $property = "name"): ?string
{
    if ($value instanceof SchemaTypeList) {
        $value = $value->getFirstValue();
    }

    return match (true) {
        is_string($value) => $value,
        $value instanceof Thing => $value->$property?->toString(),
        //null and every other kind of value cannot be resolved to a text
        default => null,
    };
}
/**
* Gets the content of a meta tag by its name or property attribute, or null if not found
* @param Crawler $dom
@ -336,18 +366,23 @@ class GenericWebProvider implements InfoProviderInterface
$canonicalURL = $scheme.'://'.$host.$canonicalURL;
}
//Try to find json-ld data in the head
$jsonLdNodes = $dom->filter('script[type="application/ld+json"]');
foreach ($jsonLdNodes as $node) {
$jsonLd = $this->json_decode_forgiving($node->textContent);
//If the content of json-ld is an array, try to find a product inside
if (!array_is_list($jsonLd)) {
$jsonLd = [$jsonLd];
$schemaReader = SchemaReader::forAllFormats();
$things = $schemaReader->readHtml($content, $canonicalURL);
//Try to find a breadcrumb schema to extract the category
$categoryBreadCrumbs = null;
foreach ($things as $thing) {
if ($thing instanceof BreadcrumbList) {
$categoryBreadCrumbs = $thing;
break;
}
foreach ($jsonLd as $item) {
if (isset($item['@type']) && $item['@type'] === 'Product') {
return $this->productJsonLdToPart($item, $canonicalURL, $dom);
}
}
//Try to find a Product schema
foreach ($things as $thing) {
if ($thing instanceof Product) {
return $this->productToPart($thing, $canonicalURL, $dom, $categoryBreadCrumbs);
}
}