Use brick schema to implement GenericWebProvider

This is less error-prone than our own parser and also allows parsing Microdata and RDFa Lite, which lets us support more webshops
This commit is contained in:
Jan Böhmer 2026-02-03 23:04:18 +01:00
parent 518953ad45
commit b48de83a32
3 changed files with 260 additions and 84 deletions

View file

@ -18,6 +18,7 @@
"api-platform/symfony": "^4.0.0", "api-platform/symfony": "^4.0.0",
"beberlei/doctrineextensions": "^1.2", "beberlei/doctrineextensions": "^1.2",
"brick/math": "^0.13.1", "brick/math": "^0.13.1",
"brick/schema": "^0.2.0",
"composer/ca-bundle": "^1.5", "composer/ca-bundle": "^1.5",
"composer/package-versions-deprecated": "^1.11.99.5", "composer/package-versions-deprecated": "^1.11.99.5",
"doctrine/data-fixtures": "^2.0.0", "doctrine/data-fixtures": "^2.0.0",

173
composer.lock generated
View file

@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically" "This file is @generated automatically"
], ],
"content-hash": "8e387d6d016f33eb7302c47ecb7a12b9", "content-hash": "7ca9c95fb85f6bf3d9b8a3aa98ca33f6",
"packages": [ "packages": [
{ {
"name": "amphp/amp", "name": "amphp/amp",
@ -2387,6 +2387,117 @@
], ],
"time": "2025-03-29T13:50:30+00:00" "time": "2025-03-29T13:50:30+00:00"
}, },
{
"name": "brick/schema",
"version": "0.2.0",
"source": {
"type": "git",
"url": "https://github.com/brick/schema.git",
"reference": "b5114bf5e8092430041a37efe1cfd5279ca764c0"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/brick/schema/zipball/b5114bf5e8092430041a37efe1cfd5279ca764c0",
"reference": "b5114bf5e8092430041a37efe1cfd5279ca764c0",
"shasum": ""
},
"require": {
"brick/structured-data": "~0.1.0 || ~0.2.0",
"ext-dom": "*",
"php": "^8.1"
},
"require-dev": {
"brick/varexporter": "^0.6",
"vimeo/psalm": "6.12.0"
},
"type": "library",
"autoload": {
"psr-4": {
"Brick\\Schema\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"description": "Schema.org library for PHP",
"keywords": [
"JSON-LD",
"brick",
"microdata",
"rdfa lite",
"schema",
"schema.org",
"structured data"
],
"support": {
"issues": "https://github.com/brick/schema/issues",
"source": "https://github.com/brick/schema/tree/0.2.0"
},
"funding": [
{
"url": "https://github.com/BenMorel",
"type": "github"
}
],
"time": "2025-06-12T07:03:20+00:00"
},
{
"name": "brick/structured-data",
"version": "0.2.0",
"source": {
"type": "git",
"url": "https://github.com/brick/structured-data.git",
"reference": "be9b28720e2aba87f19c90500700970be85affde"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/brick/structured-data/zipball/be9b28720e2aba87f19c90500700970be85affde",
"reference": "be9b28720e2aba87f19c90500700970be85affde",
"shasum": ""
},
"require": {
"ext-dom": "*",
"ext-json": "*",
"ext-libxml": "*",
"php": "^8.1",
"sabre/uri": "^2.1 || ^3.0"
},
"require-dev": {
"php-coveralls/php-coveralls": "^2.0",
"phpunit/phpunit": "^8.0 || ^9.0",
"vimeo/psalm": "6.12.0"
},
"type": "library",
"autoload": {
"psr-4": {
"Brick\\StructuredData\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"description": "Microdata, RDFa Lite & JSON-LD structured data reader",
"keywords": [
"JSON-LD",
"brick",
"microdata",
"rdfa",
"structured data"
],
"support": {
"issues": "https://github.com/brick/structured-data/issues",
"source": "https://github.com/brick/structured-data/tree/0.2.0"
},
"funding": [
{
"url": "https://github.com/BenMorel",
"type": "github"
}
],
"time": "2025-06-10T23:48:46+00:00"
},
{ {
"name": "composer/ca-bundle", "name": "composer/ca-bundle",
"version": "1.5.10", "version": "1.5.10",
@ -9595,6 +9706,66 @@
}, },
"time": "2025-09-14T07:37:21+00:00" "time": "2025-09-14T07:37:21+00:00"
}, },
{
"name": "sabre/uri",
"version": "3.0.2",
"source": {
"type": "git",
"url": "https://github.com/sabre-io/uri.git",
"reference": "38eeab6ed9eec435a2188db489d4649c56272c51"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/sabre-io/uri/zipball/38eeab6ed9eec435a2188db489d4649c56272c51",
"reference": "38eeab6ed9eec435a2188db489d4649c56272c51",
"shasum": ""
},
"require": {
"php": "^7.4 || ^8.0"
},
"require-dev": {
"friendsofphp/php-cs-fixer": "^3.64",
"phpstan/extension-installer": "^1.4",
"phpstan/phpstan": "^1.12",
"phpstan/phpstan-phpunit": "^1.4",
"phpstan/phpstan-strict-rules": "^1.6",
"phpunit/phpunit": "^9.6"
},
"type": "library",
"autoload": {
"files": [
"lib/functions.php"
],
"psr-4": {
"Sabre\\Uri\\": "lib/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"BSD-3-Clause"
],
"authors": [
{
"name": "Evert Pot",
"email": "me@evertpot.com",
"homepage": "http://evertpot.com/",
"role": "Developer"
}
],
"description": "Functions for making sense out of URIs.",
"homepage": "http://sabre.io/uri/",
"keywords": [
"rfc3986",
"uri",
"url"
],
"support": {
"forum": "https://groups.google.com/group/sabredav-discuss",
"issues": "https://github.com/sabre-io/uri/issues",
"source": "https://github.com/fruux/sabre-uri"
},
"time": "2024-09-04T15:30:08+00:00"
},
{ {
"name": "scheb/2fa-backup-code", "name": "scheb/2fa-backup-code",
"version": "v7.13.1", "version": "v7.13.1",

View file

@ -32,6 +32,18 @@ use App\Services\InfoProviderSystem\DTOs\SearchResultDTO;
use App\Services\InfoProviderSystem\PartInfoRetriever; use App\Services\InfoProviderSystem\PartInfoRetriever;
use App\Services\InfoProviderSystem\ProviderRegistry; use App\Services\InfoProviderSystem\ProviderRegistry;
use App\Settings\InfoProviderSystem\GenericWebProviderSettings; use App\Settings\InfoProviderSystem\GenericWebProviderSettings;
use Brick\Schema\Interfaces\ImageObject;
use Brick\Schema\Interfaces\Product;
use Brick\Schema\Interfaces\PropertyValue;
use Brick\Schema\Interfaces\QuantitativeValue;
use Brick\Schema\Interfaces\Thing;
use Brick\Schema\SchemaReader;
use Brick\Schema\SchemaTypeList;
use Brick\StructuredData\HTMLReader;
use Brick\StructuredData\Reader\JsonLdReader;
use Brick\StructuredData\Reader\MicrodataReader;
use Brick\StructuredData\Reader\RdfaLiteReader;
use Brick\StructuredData\Reader\ReaderChain;
use Symfony\Component\DomCrawler\Crawler; use Symfony\Component\DomCrawler\Crawler;
use Symfony\Contracts\HttpClient\HttpClientInterface; use Symfony\Contracts\HttpClient\HttpClientInterface;
@ -104,126 +116,122 @@ class GenericWebProvider implements InfoProviderInterface
return $host; return $host;
} }
private function productJsonLdToPart(array $jsonLd, string $url, Crawler $dom): PartDetailDTO private function productToPart(Product $product, string $url, Crawler $dom): PartDetailDTO
{ {
$notes = $jsonLd['description'] ?? ""; $notes = $product->description->toString() ?? "";
if (isset($jsonLd['disambiguatingDescription'])) { if ($product->disambiguatingDescription !== null) {
if (!empty($notes)) { if (!empty($notes)) {
$notes .= "\n\n"; $notes .= "\n\n";
} }
$notes .= $jsonLd['disambiguatingDescription']; $notes .= $product->disambiguatingDescription->toString();
} }
//Extract vendor infos
$vendor_infos = null; $vendor_infos = null;
if (isset($jsonLd['offers'])) { $offer = $product->offers->getFirstValue();
if ($offer !== null) {
if (array_is_list($jsonLd['offers'])) {
$offer = $jsonLd['offers'][0];
} else {
$offer = $jsonLd['offers'];
}
//Make $jsonLd['url'] absolute if it's relative
if (isset($jsonLd['url']) && parse_url($jsonLd['url'], PHP_URL_SCHEME) === null) {
$parsedUrl = parse_url($url);
$scheme = $parsedUrl['scheme'] ?? 'https';
$host = $parsedUrl['host'] ?? '';
$jsonLd['url'] = $scheme.'://'.$host.$jsonLd['url'];
}
$prices = []; $prices = [];
if (isset($offer['price'])) { if ($offer->price->toString() !== null) {
$prices[] = new PriceDTO( $prices = [new PriceDTO(
minimum_discount_amount: 1, minimum_discount_amount: 1,
price: (string) $offer['price'], price: $offer->price->toString(),
currency_iso_code: $offer['priceCurrency'] ?? null currency_iso_code: $offer->priceCurrency?->toString()
); )];
} else if (isset($offer['offers']) && array_is_list($offer['offers'])) { } else { //Check for nested offers (like IKEA does it)
//Some sites nest offers $offer2 = $offer->offers->getFirstValue();
foreach ($offer['offers'] as $subOffer) { if ($offer2 !== null && $offer2->price->toString() !== null) {
if (isset($subOffer['price'])) { $prices = [
$prices[] = new PriceDTO( new PriceDTO(
minimum_discount_amount: 1, minimum_discount_amount: 1,
price: (string) $subOffer['price'], price: $offer2->price->toString(),
currency_iso_code: $subOffer['priceCurrency'] ?? null currency_iso_code: $offer2->priceCurrency?->toString()
); )
} ];
} }
} }
$vendor_infos = [new PurchaseInfoDTO( $vendor_infos = [new PurchaseInfoDTO(
distributor_name: $this->extractShopName($url), distributor_name: $this->extractShopName($url),
order_number: (string) ($jsonLd['sku'] ?? $jsonLd['@id'] ?? $jsonLd['gtin'] ?? 'Unknown'), order_number: $product->sku?->toString() ?? $product->identifier?->toString() ?? 'Unknown',
prices: $prices, prices: $prices,
product_url: $jsonLd['url'] ?? $url, product_url: $offer->url?->toString() ?? $url,
)]; )];
} }
//Extract image:
$image = null; $image = null;
if (isset($jsonLd['image'])) { if ($product->image !== null) {
if (is_array($jsonLd['image'])) { $imageObj = $product->image->getFirstValue();
if (array_is_list($jsonLd['image'])) { if (is_string($imageObj)) {
$image = $jsonLd['image'][0] ?? null; $image = $imageObj;
} } else if ($imageObj instanceof ImageObject) {
} elseif (is_string($jsonLd['image'])) { $image = $imageObj->contentUrl?->toString() ?? $imageObj->url?->toString();
$image = $jsonLd['image'];
} }
} }
//If image is an object with @type ImageObject, extract the url
if (is_array($image) && isset($image['@type']) && $image['@type'] === 'ImageObject') {
$image = $image['contentUrl'] ?? $image['url'] ?? null;
}
//Try to extract parameters from additionalProperty //Extract parameters from additionalProperty
$parameters = []; $parameters = [];
if (isset($jsonLd['additionalProperty']) && array_is_list($jsonLd['additionalProperty'])) { foreach ($product->additionalProperty->getValues() as $property) {
foreach ($jsonLd['additionalProperty'] as $property) { //TODO: Handle minValue and maxValue if ($property instanceof PropertyValue) { //TODO: Handle minValue and maxValue
if (isset ($property['unitText'])) { if ($property->unitText->toString() !== null) {
$parameters[] = ParameterDTO::parseValueField( $parameters[] = ParameterDTO::parseValueField(
name: $property['name'] ?? 'Unknown', name: $property->name->toString() ?? 'Unknown',
value: $property['value'] ?? '', value: $property->value->toString() ?? '',
unit: $property['unitText'] unit: $property->unitText->toString()
); );
} else { } else {
$parameters[] = ParameterDTO::parseValueIncludingUnit( $parameters[] = ParameterDTO::parseValueIncludingUnit(
name: $property['name'] ?? 'Unknown', name: $property->name->toString() ?? 'Unknown',
value: $property['value'] ?? '' value: $property->value->toString() ?? ''
); );
} }
} }
} }
//Try to extract weight
$mass = null;
if (($weight = $product?->weight->getFirstValue()) instanceof QuantitativeValue) {
$mass = $weight->value->toString();
}
return new PartDetailDTO( return new PartDetailDTO(
provider_key: $this->getProviderKey(), provider_key: $this->getProviderKey(),
provider_id: $url, provider_id: $url,
name: $jsonLd ['name'] ?? 'Unknown Name', name: $product->name?->toString() ?? $product->alternateName?->toString() ?? $product?->mpn->toString() ?? 'Unknown Name',
description: $this->getMetaContent($dom, 'og:description') ?? $this->getMetaContent($dom, 'description') ?? '', description: $this->getMetaContent($dom, 'og:description') ?? $this->getMetaContent($dom, 'description') ?? '',
category: isset($jsonLd['category']) && is_string($jsonLd['category']) ? $jsonLd['category'] : null, category: $product->category?->toString(),
manufacturer: $jsonLd['manufacturer']['name'] ?? $jsonLd['brand']['name'] ?? null, manufacturer: self::propertyOrString($product->manufacturer) ?? self::propertyOrString($product->brand),
mpn: $jsonLd['mpn'] ?? null, mpn: $product->mpn?->toString(),
preview_image_url: $image, preview_image_url: $image,
provider_url: $url, provider_url: $url,
notes: $notes, notes: $notes,
parameters: $parameters, parameters: $parameters,
vendor_infos: $vendor_infos, vendor_infos: $vendor_infos,
mass: isset($jsonLd['weight']['value']) ? (float)$jsonLd['weight']['value'] : null, mass: $mass
); );
} }
/** private static function propertyOrString(SchemaTypeList|Thing|string|null $value, string $property = "name"): ?string
* Decodes JSON in a forgiving way, trying to fix common issues.
* @param string $json
* @return array
* @throws \JsonException
*/
private function json_decode_forgiving(string $json): array
{ {
//Sanitize common issues if ($value instanceof SchemaTypeList) {
$json = preg_replace("/[\r\n]+/", " ", $json); $value = $value->getFirstValue();
return json_decode($json, true, 512, JSON_THROW_ON_ERROR); }
if ($value === null) {
return null;
}
if (is_string($value)) {
return $value;
}
if ($value instanceof Thing) {
return $value->$property?->toString();
}
return null;
} }
/** /**
* Gets the content of a meta tag by its name or property attribute, or null if not found * Gets the content of a meta tag by its name or property attribute, or null if not found
* @param Crawler $dom * @param Crawler $dom
@ -336,18 +344,14 @@ class GenericWebProvider implements InfoProviderInterface
$canonicalURL = $scheme.'://'.$host.$canonicalURL; $canonicalURL = $scheme.'://'.$host.$canonicalURL;
} }
//Try to find json-ld data in the head
$jsonLdNodes = $dom->filter('script[type="application/ld+json"]'); $schemaReader = SchemaReader::forAllFormats();
foreach ($jsonLdNodes as $node) { $things = $schemaReader->readHtml($content, $canonicalURL);
$jsonLd = $this->json_decode_forgiving($node->textContent);
//If the content of json-ld is an array, try to find a product inside //Try to find a Product schema
if (!array_is_list($jsonLd)) { foreach ($things as $thing) {
$jsonLd = [$jsonLd]; if ($thing instanceof Product) {
} return $this->productToPart($thing, $canonicalURL, $dom);
foreach ($jsonLd as $item) {
if (isset($item['@type']) && $item['@type'] === 'Product') {
return $this->productJsonLdToPart($item, $canonicalURL, $dom);
}
} }
} }