From b48de83a3289d6df55a90b4baaa12fb9603612d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Tue, 3 Feb 2026 23:04:18 +0100 Subject: [PATCH] Use brick schema to implement GenericWebProvider This is less error-prone than our own parser and also allows parsing Microdata and RDFa Lite to support more webshops --- composer.json | 1 + composer.lock | 173 +++++++++++++++++- .../Providers/GenericWebProvider.php | 170 ++++++++--------- 3 files changed, 260 insertions(+), 84 deletions(-) diff --git a/composer.json b/composer.json index 8ce686c2..36dd461e 100644 --- a/composer.json +++ b/composer.json @@ -18,6 +18,7 @@ "api-platform/symfony": "^4.0.0", "beberlei/doctrineextensions": "^1.2", "brick/math": "^0.13.1", + "brick/schema": "^0.2.0", "composer/ca-bundle": "^1.5", "composer/package-versions-deprecated": "^1.11.99.5", "doctrine/data-fixtures": "^2.0.0", diff --git a/composer.lock b/composer.lock index 56ab8701..28d7c981 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "8e387d6d016f33eb7302c47ecb7a12b9", + "content-hash": "7ca9c95fb85f6bf3d9b8a3aa98ca33f6", "packages": [ { "name": "amphp/amp", @@ -2387,6 +2387,117 @@ ], "time": "2025-03-29T13:50:30+00:00" }, + { + "name": "brick/schema", + "version": "0.2.0", + "source": { + "type": "git", + "url": "https://github.com/brick/schema.git", + "reference": "b5114bf5e8092430041a37efe1cfd5279ca764c0" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/brick/schema/zipball/b5114bf5e8092430041a37efe1cfd5279ca764c0", + "reference": "b5114bf5e8092430041a37efe1cfd5279ca764c0", + "shasum": "" + }, + "require": { + "brick/structured-data": "~0.1.0 || ~0.2.0", + "ext-dom": "*", + "php": "^8.1" + }, + "require-dev": { + "brick/varexporter": "^0.6", + "vimeo/psalm": "6.12.0" + }, + "type": "library", + "autoload": { + "psr-4":
{ + "Brick\\Schema\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "description": "Schema.org library for PHP", + "keywords": [ + "JSON-LD", + "brick", + "microdata", + "rdfa lite", + "schema", + "schema.org", + "structured data" + ], + "support": { + "issues": "https://github.com/brick/schema/issues", + "source": "https://github.com/brick/schema/tree/0.2.0" + }, + "funding": [ + { + "url": "https://github.com/BenMorel", + "type": "github" + } + ], + "time": "2025-06-12T07:03:20+00:00" + }, + { + "name": "brick/structured-data", + "version": "0.2.0", + "source": { + "type": "git", + "url": "https://github.com/brick/structured-data.git", + "reference": "be9b28720e2aba87f19c90500700970be85affde" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/brick/structured-data/zipball/be9b28720e2aba87f19c90500700970be85affde", + "reference": "be9b28720e2aba87f19c90500700970be85affde", + "shasum": "" + }, + "require": { + "ext-dom": "*", + "ext-json": "*", + "ext-libxml": "*", + "php": "^8.1", + "sabre/uri": "^2.1 || ^3.0" + }, + "require-dev": { + "php-coveralls/php-coveralls": "^2.0", + "phpunit/phpunit": "^8.0 || ^9.0", + "vimeo/psalm": "6.12.0" + }, + "type": "library", + "autoload": { + "psr-4": { + "Brick\\StructuredData\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "description": "Microdata, RDFa Lite & JSON-LD structured data reader", + "keywords": [ + "JSON-LD", + "brick", + "microdata", + "rdfa", + "structured data" + ], + "support": { + "issues": "https://github.com/brick/structured-data/issues", + "source": "https://github.com/brick/structured-data/tree/0.2.0" + }, + "funding": [ + { + "url": "https://github.com/BenMorel", + "type": "github" + } + ], + "time": "2025-06-10T23:48:46+00:00" + }, { "name": "composer/ca-bundle", "version": "1.5.10", @@ -9595,6 +9706,66 @@ }, "time": "2025-09-14T07:37:21+00:00" }, + { + "name": 
"sabre/uri", + "version": "3.0.2", + "source": { + "type": "git", + "url": "https://github.com/sabre-io/uri.git", + "reference": "38eeab6ed9eec435a2188db489d4649c56272c51" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sabre-io/uri/zipball/38eeab6ed9eec435a2188db489d4649c56272c51", + "reference": "38eeab6ed9eec435a2188db489d4649c56272c51", + "shasum": "" + }, + "require": { + "php": "^7.4 || ^8.0" + }, + "require-dev": { + "friendsofphp/php-cs-fixer": "^3.64", + "phpstan/extension-installer": "^1.4", + "phpstan/phpstan": "^1.12", + "phpstan/phpstan-phpunit": "^1.4", + "phpstan/phpstan-strict-rules": "^1.6", + "phpunit/phpunit": "^9.6" + }, + "type": "library", + "autoload": { + "files": [ + "lib/functions.php" + ], + "psr-4": { + "Sabre\\Uri\\": "lib/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Evert Pot", + "email": "me@evertpot.com", + "homepage": "http://evertpot.com/", + "role": "Developer" + } + ], + "description": "Functions for making sense out of URIs.", + "homepage": "http://sabre.io/uri/", + "keywords": [ + "rfc3986", + "uri", + "url" + ], + "support": { + "forum": "https://groups.google.com/group/sabredav-discuss", + "issues": "https://github.com/sabre-io/uri/issues", + "source": "https://github.com/fruux/sabre-uri" + }, + "time": "2024-09-04T15:30:08+00:00" + }, { "name": "scheb/2fa-backup-code", "version": "v7.13.1", diff --git a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php index 66d45707..e85ce5f4 100644 --- a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php +++ b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php @@ -32,6 +32,18 @@ use App\Services\InfoProviderSystem\DTOs\SearchResultDTO; use App\Services\InfoProviderSystem\PartInfoRetriever; use App\Services\InfoProviderSystem\ProviderRegistry; use 
App\Settings\InfoProviderSystem\GenericWebProviderSettings; +use Brick\Schema\Interfaces\ImageObject; +use Brick\Schema\Interfaces\Product; +use Brick\Schema\Interfaces\PropertyValue; +use Brick\Schema\Interfaces\QuantitativeValue; +use Brick\Schema\Interfaces\Thing; +use Brick\Schema\SchemaReader; +use Brick\Schema\SchemaTypeList; +use Brick\StructuredData\HTMLReader; +use Brick\StructuredData\Reader\JsonLdReader; +use Brick\StructuredData\Reader\MicrodataReader; +use Brick\StructuredData\Reader\RdfaLiteReader; +use Brick\StructuredData\Reader\ReaderChain; use Symfony\Component\DomCrawler\Crawler; use Symfony\Contracts\HttpClient\HttpClientInterface; @@ -104,126 +116,122 @@ class GenericWebProvider implements InfoProviderInterface return $host; } - private function productJsonLdToPart(array $jsonLd, string $url, Crawler $dom): PartDetailDTO + private function productToPart(Product $product, string $url, Crawler $dom): PartDetailDTO { - $notes = $jsonLd['description'] ?? ""; - if (isset($jsonLd['disambiguatingDescription'])) { + $notes = $product->description->toString() ?? ""; + if ($product->disambiguatingDescription !== null) { if (!empty($notes)) { $notes .= "\n\n"; } - $notes .= $jsonLd['disambiguatingDescription']; + $notes .= $product->disambiguatingDescription->toString(); } + + //Extract vendor infos $vendor_infos = null; - if (isset($jsonLd['offers'])) { - - if (array_is_list($jsonLd['offers'])) { - $offer = $jsonLd['offers'][0]; - } else { - $offer = $jsonLd['offers']; - } - - //Make $jsonLd['url'] absolute if it's relative - if (isset($jsonLd['url']) && parse_url($jsonLd['url'], PHP_URL_SCHEME) === null) { - $parsedUrl = parse_url($url); - $scheme = $parsedUrl['scheme'] ?? 'https'; - $host = $parsedUrl['host'] ?? 
''; - $jsonLd['url'] = $scheme.'://'.$host.$jsonLd['url']; - } - + $offer = $product->offers->getFirstValue(); + if ($offer !== null) { $prices = []; - if (isset($offer['price'])) { - $prices[] = new PriceDTO( + if ($offer->price->toString() !== null) { + $prices = [new PriceDTO( minimum_discount_amount: 1, - price: (string) $offer['price'], - currency_iso_code: $offer['priceCurrency'] ?? null - ); - } else if (isset($offer['offers']) && array_is_list($offer['offers'])) { - //Some sites nest offers - foreach ($offer['offers'] as $subOffer) { - if (isset($subOffer['price'])) { - $prices[] = new PriceDTO( + price: $offer->price->toString(), + currency_iso_code: $offer->priceCurrency?->toString() + )]; + } else { //Check for nested offers (like IKEA does it) + $offer2 = $offer->offers->getFirstValue(); + if ($offer2 !== null && $offer2->price->toString() !== null) { + $prices = [ + new PriceDTO( minimum_discount_amount: 1, - price: (string) $subOffer['price'], - currency_iso_code: $subOffer['priceCurrency'] ?? null - ); - } + price: $offer2->price->toString(), + currency_iso_code: $offer2->priceCurrency?->toString() + ) + ]; } } $vendor_infos = [new PurchaseInfoDTO( distributor_name: $this->extractShopName($url), - order_number: (string) ($jsonLd['sku'] ?? $jsonLd['@id'] ?? $jsonLd['gtin'] ?? 'Unknown'), + order_number: $product->sku?->toString() ?? $product->identifier?->toString() ?? 'Unknown', prices: $prices, - product_url: $jsonLd['url'] ?? $url, + product_url: $offer->url?->toString() ?? $url, )]; } + //Extract image: $image = null; - if (isset($jsonLd['image'])) { - if (is_array($jsonLd['image'])) { - if (array_is_list($jsonLd['image'])) { - $image = $jsonLd['image'][0] ?? 
null; - } - } elseif (is_string($jsonLd['image'])) { - $image = $jsonLd['image']; + if ($product->image !== null) { + $imageObj = $product->image->getFirstValue(); + if (is_string($imageObj)) { + $image = $imageObj; + } else if ($imageObj instanceof ImageObject) { + $image = $imageObj->contentUrl?->toString() ?? $imageObj->url?->toString(); } } - //If image is an object with @type ImageObject, extract the url - if (is_array($image) && isset($image['@type']) && $image['@type'] === 'ImageObject') { - $image = $image['contentUrl'] ?? $image['url'] ?? null; - } - //Try to extract parameters from additionalProperty + //Extract parameters from additionalProperty $parameters = []; - if (isset($jsonLd['additionalProperty']) && array_is_list($jsonLd['additionalProperty'])) { - foreach ($jsonLd['additionalProperty'] as $property) { //TODO: Handle minValue and maxValue - if (isset ($property['unitText'])) { + foreach ($product->additionalProperty->getValues() as $property) { + if ($property instanceof PropertyValue) { //TODO: Handle minValue and maxValue + if ($property->unitText->toString() !== null) { $parameters[] = ParameterDTO::parseValueField( - name: $property['name'] ?? 'Unknown', - value: $property['value'] ?? '', - unit: $property['unitText'] + name: $property->name->toString() ?? 'Unknown', + value: $property->value->toString() ?? '', + unit: $property->unitText->toString() ); } else { $parameters[] = ParameterDTO::parseValueIncludingUnit( - name: $property['name'] ?? 'Unknown', - value: $property['value'] ?? '' + name: $property->name->toString() ?? 'Unknown', + value: $property->value->toString() ?? '' ); } } } + //Try to extract weight + $mass = null; + if (($weight = $product?->weight->getFirstValue()) instanceof QuantitativeValue) { + $mass = $weight->value->toString(); + } return new PartDetailDTO( provider_key: $this->getProviderKey(), provider_id: $url, - name: $jsonLd ['name'] ?? 'Unknown Name', + name: $product->name?->toString() ?? 
$product->alternateName?->toString() ?? $product?->mpn->toString() ?? 'Unknown Name', description: $this->getMetaContent($dom, 'og:description') ?? $this->getMetaContent($dom, 'description') ?? '', - category: isset($jsonLd['category']) && is_string($jsonLd['category']) ? $jsonLd['category'] : null, - manufacturer: $jsonLd['manufacturer']['name'] ?? $jsonLd['brand']['name'] ?? null, - mpn: $jsonLd['mpn'] ?? null, + category: $product->category?->toString(), + manufacturer: self::propertyOrString($product->manufacturer) ?? self::propertyOrString($product->brand), + mpn: $product->mpn?->toString(), preview_image_url: $image, provider_url: $url, notes: $notes, parameters: $parameters, vendor_infos: $vendor_infos, - mass: isset($jsonLd['weight']['value']) ? (float)$jsonLd['weight']['value'] : null, + mass: $mass ); } - /** - * Decodes JSON in a forgiving way, trying to fix common issues. - * @param string $json - * @return array - * @throws \JsonException - */ - private function json_decode_forgiving(string $json): array + private static function propertyOrString(SchemaTypeList|Thing|string|null $value, string $property = "name"): ?string { - //Sanitize common issues - $json = preg_replace("/[\r\n]+/", " ", $json); - return json_decode($json, true, 512, JSON_THROW_ON_ERROR); + if ($value instanceof SchemaTypeList) { + $value = $value->getFirstValue(); + } + if ($value === null) { + return null; + } + + if (is_string($value)) { + return $value; + } + + if ($value instanceof Thing) { + return $value->$property?->toString(); + } + return null; } + /** * Gets the content of a meta tag by its name or property attribute, or null if not found * @param Crawler $dom @@ -336,18 +344,14 @@ class GenericWebProvider implements InfoProviderInterface $canonicalURL = $scheme.'://'.$host.$canonicalURL; } - //Try to find json-ld data in the head - $jsonLdNodes = $dom->filter('script[type="application/ld+json"]'); - foreach ($jsonLdNodes as $node) { - $jsonLd = 
$this->json_decode_forgiving($node->textContent); - //If the content of json-ld is an array, try to find a product inside - if (!array_is_list($jsonLd)) { - $jsonLd = [$jsonLd]; - } - foreach ($jsonLd as $item) { - if (isset($item['@type']) && $item['@type'] === 'Product') { - return $this->productJsonLdToPart($item, $canonicalURL, $dom); - } + + $schemaReader = SchemaReader::forAllFormats(); + $things = $schemaReader->readHtml($content, $canonicalURL); + + //Try to find a Product schema + foreach ($things as $thing) { + if ($thing instanceof Product) { + return $this->productToPart($thing, $canonicalURL, $dom); } }