mirror of
https://github.com/Part-DB/Part-DB-server.git
synced 2026-02-11 12:09:36 +00:00
Use brick schema to implement GenericWebProvider
This is less error-prone than our own parser and also allows parsing Microdata and RDFa Lite, which lets us support more webshops.
This commit is contained in:
parent
518953ad45
commit
b48de83a32
3 changed files with 260 additions and 84 deletions
|
|
@ -18,6 +18,7 @@
|
||||||
"api-platform/symfony": "^4.0.0",
|
"api-platform/symfony": "^4.0.0",
|
||||||
"beberlei/doctrineextensions": "^1.2",
|
"beberlei/doctrineextensions": "^1.2",
|
||||||
"brick/math": "^0.13.1",
|
"brick/math": "^0.13.1",
|
||||||
|
"brick/schema": "^0.2.0",
|
||||||
"composer/ca-bundle": "^1.5",
|
"composer/ca-bundle": "^1.5",
|
||||||
"composer/package-versions-deprecated": "^1.11.99.5",
|
"composer/package-versions-deprecated": "^1.11.99.5",
|
||||||
"doctrine/data-fixtures": "^2.0.0",
|
"doctrine/data-fixtures": "^2.0.0",
|
||||||
|
|
|
||||||
173
composer.lock
generated
173
composer.lock
generated
|
|
@ -4,7 +4,7 @@
|
||||||
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
|
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
|
||||||
"This file is @generated automatically"
|
"This file is @generated automatically"
|
||||||
],
|
],
|
||||||
"content-hash": "8e387d6d016f33eb7302c47ecb7a12b9",
|
"content-hash": "7ca9c95fb85f6bf3d9b8a3aa98ca33f6",
|
||||||
"packages": [
|
"packages": [
|
||||||
{
|
{
|
||||||
"name": "amphp/amp",
|
"name": "amphp/amp",
|
||||||
|
|
@ -2387,6 +2387,117 @@
|
||||||
],
|
],
|
||||||
"time": "2025-03-29T13:50:30+00:00"
|
"time": "2025-03-29T13:50:30+00:00"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"name": "brick/schema",
|
||||||
|
"version": "0.2.0",
|
||||||
|
"source": {
|
||||||
|
"type": "git",
|
||||||
|
"url": "https://github.com/brick/schema.git",
|
||||||
|
"reference": "b5114bf5e8092430041a37efe1cfd5279ca764c0"
|
||||||
|
},
|
||||||
|
"dist": {
|
||||||
|
"type": "zip",
|
||||||
|
"url": "https://api.github.com/repos/brick/schema/zipball/b5114bf5e8092430041a37efe1cfd5279ca764c0",
|
||||||
|
"reference": "b5114bf5e8092430041a37efe1cfd5279ca764c0",
|
||||||
|
"shasum": ""
|
||||||
|
},
|
||||||
|
"require": {
|
||||||
|
"brick/structured-data": "~0.1.0 || ~0.2.0",
|
||||||
|
"ext-dom": "*",
|
||||||
|
"php": "^8.1"
|
||||||
|
},
|
||||||
|
"require-dev": {
|
||||||
|
"brick/varexporter": "^0.6",
|
||||||
|
"vimeo/psalm": "6.12.0"
|
||||||
|
},
|
||||||
|
"type": "library",
|
||||||
|
"autoload": {
|
||||||
|
"psr-4": {
|
||||||
|
"Brick\\Schema\\": "src/"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"notification-url": "https://packagist.org/downloads/",
|
||||||
|
"license": [
|
||||||
|
"MIT"
|
||||||
|
],
|
||||||
|
"description": "Schema.org library for PHP",
|
||||||
|
"keywords": [
|
||||||
|
"JSON-LD",
|
||||||
|
"brick",
|
||||||
|
"microdata",
|
||||||
|
"rdfa lite",
|
||||||
|
"schema",
|
||||||
|
"schema.org",
|
||||||
|
"structured data"
|
||||||
|
],
|
||||||
|
"support": {
|
||||||
|
"issues": "https://github.com/brick/schema/issues",
|
||||||
|
"source": "https://github.com/brick/schema/tree/0.2.0"
|
||||||
|
},
|
||||||
|
"funding": [
|
||||||
|
{
|
||||||
|
"url": "https://github.com/BenMorel",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"time": "2025-06-12T07:03:20+00:00"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "brick/structured-data",
|
||||||
|
"version": "0.2.0",
|
||||||
|
"source": {
|
||||||
|
"type": "git",
|
||||||
|
"url": "https://github.com/brick/structured-data.git",
|
||||||
|
"reference": "be9b28720e2aba87f19c90500700970be85affde"
|
||||||
|
},
|
||||||
|
"dist": {
|
||||||
|
"type": "zip",
|
||||||
|
"url": "https://api.github.com/repos/brick/structured-data/zipball/be9b28720e2aba87f19c90500700970be85affde",
|
||||||
|
"reference": "be9b28720e2aba87f19c90500700970be85affde",
|
||||||
|
"shasum": ""
|
||||||
|
},
|
||||||
|
"require": {
|
||||||
|
"ext-dom": "*",
|
||||||
|
"ext-json": "*",
|
||||||
|
"ext-libxml": "*",
|
||||||
|
"php": "^8.1",
|
||||||
|
"sabre/uri": "^2.1 || ^3.0"
|
||||||
|
},
|
||||||
|
"require-dev": {
|
||||||
|
"php-coveralls/php-coveralls": "^2.0",
|
||||||
|
"phpunit/phpunit": "^8.0 || ^9.0",
|
||||||
|
"vimeo/psalm": "6.12.0"
|
||||||
|
},
|
||||||
|
"type": "library",
|
||||||
|
"autoload": {
|
||||||
|
"psr-4": {
|
||||||
|
"Brick\\StructuredData\\": "src/"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"notification-url": "https://packagist.org/downloads/",
|
||||||
|
"license": [
|
||||||
|
"MIT"
|
||||||
|
],
|
||||||
|
"description": "Microdata, RDFa Lite & JSON-LD structured data reader",
|
||||||
|
"keywords": [
|
||||||
|
"JSON-LD",
|
||||||
|
"brick",
|
||||||
|
"microdata",
|
||||||
|
"rdfa",
|
||||||
|
"structured data"
|
||||||
|
],
|
||||||
|
"support": {
|
||||||
|
"issues": "https://github.com/brick/structured-data/issues",
|
||||||
|
"source": "https://github.com/brick/structured-data/tree/0.2.0"
|
||||||
|
},
|
||||||
|
"funding": [
|
||||||
|
{
|
||||||
|
"url": "https://github.com/BenMorel",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"time": "2025-06-10T23:48:46+00:00"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"name": "composer/ca-bundle",
|
"name": "composer/ca-bundle",
|
||||||
"version": "1.5.10",
|
"version": "1.5.10",
|
||||||
|
|
@ -9595,6 +9706,66 @@
|
||||||
},
|
},
|
||||||
"time": "2025-09-14T07:37:21+00:00"
|
"time": "2025-09-14T07:37:21+00:00"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"name": "sabre/uri",
|
||||||
|
"version": "3.0.2",
|
||||||
|
"source": {
|
||||||
|
"type": "git",
|
||||||
|
"url": "https://github.com/sabre-io/uri.git",
|
||||||
|
"reference": "38eeab6ed9eec435a2188db489d4649c56272c51"
|
||||||
|
},
|
||||||
|
"dist": {
|
||||||
|
"type": "zip",
|
||||||
|
"url": "https://api.github.com/repos/sabre-io/uri/zipball/38eeab6ed9eec435a2188db489d4649c56272c51",
|
||||||
|
"reference": "38eeab6ed9eec435a2188db489d4649c56272c51",
|
||||||
|
"shasum": ""
|
||||||
|
},
|
||||||
|
"require": {
|
||||||
|
"php": "^7.4 || ^8.0"
|
||||||
|
},
|
||||||
|
"require-dev": {
|
||||||
|
"friendsofphp/php-cs-fixer": "^3.64",
|
||||||
|
"phpstan/extension-installer": "^1.4",
|
||||||
|
"phpstan/phpstan": "^1.12",
|
||||||
|
"phpstan/phpstan-phpunit": "^1.4",
|
||||||
|
"phpstan/phpstan-strict-rules": "^1.6",
|
||||||
|
"phpunit/phpunit": "^9.6"
|
||||||
|
},
|
||||||
|
"type": "library",
|
||||||
|
"autoload": {
|
||||||
|
"files": [
|
||||||
|
"lib/functions.php"
|
||||||
|
],
|
||||||
|
"psr-4": {
|
||||||
|
"Sabre\\Uri\\": "lib/"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"notification-url": "https://packagist.org/downloads/",
|
||||||
|
"license": [
|
||||||
|
"BSD-3-Clause"
|
||||||
|
],
|
||||||
|
"authors": [
|
||||||
|
{
|
||||||
|
"name": "Evert Pot",
|
||||||
|
"email": "me@evertpot.com",
|
||||||
|
"homepage": "http://evertpot.com/",
|
||||||
|
"role": "Developer"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"description": "Functions for making sense out of URIs.",
|
||||||
|
"homepage": "http://sabre.io/uri/",
|
||||||
|
"keywords": [
|
||||||
|
"rfc3986",
|
||||||
|
"uri",
|
||||||
|
"url"
|
||||||
|
],
|
||||||
|
"support": {
|
||||||
|
"forum": "https://groups.google.com/group/sabredav-discuss",
|
||||||
|
"issues": "https://github.com/sabre-io/uri/issues",
|
||||||
|
"source": "https://github.com/fruux/sabre-uri"
|
||||||
|
},
|
||||||
|
"time": "2024-09-04T15:30:08+00:00"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"name": "scheb/2fa-backup-code",
|
"name": "scheb/2fa-backup-code",
|
||||||
"version": "v7.13.1",
|
"version": "v7.13.1",
|
||||||
|
|
|
||||||
|
|
@ -32,6 +32,18 @@ use App\Services\InfoProviderSystem\DTOs\SearchResultDTO;
|
||||||
use App\Services\InfoProviderSystem\PartInfoRetriever;
|
use App\Services\InfoProviderSystem\PartInfoRetriever;
|
||||||
use App\Services\InfoProviderSystem\ProviderRegistry;
|
use App\Services\InfoProviderSystem\ProviderRegistry;
|
||||||
use App\Settings\InfoProviderSystem\GenericWebProviderSettings;
|
use App\Settings\InfoProviderSystem\GenericWebProviderSettings;
|
||||||
|
use Brick\Schema\Interfaces\ImageObject;
|
||||||
|
use Brick\Schema\Interfaces\Product;
|
||||||
|
use Brick\Schema\Interfaces\PropertyValue;
|
||||||
|
use Brick\Schema\Interfaces\QuantitativeValue;
|
||||||
|
use Brick\Schema\Interfaces\Thing;
|
||||||
|
use Brick\Schema\SchemaReader;
|
||||||
|
use Brick\Schema\SchemaTypeList;
|
||||||
|
use Brick\StructuredData\HTMLReader;
|
||||||
|
use Brick\StructuredData\Reader\JsonLdReader;
|
||||||
|
use Brick\StructuredData\Reader\MicrodataReader;
|
||||||
|
use Brick\StructuredData\Reader\RdfaLiteReader;
|
||||||
|
use Brick\StructuredData\Reader\ReaderChain;
|
||||||
use Symfony\Component\DomCrawler\Crawler;
|
use Symfony\Component\DomCrawler\Crawler;
|
||||||
use Symfony\Contracts\HttpClient\HttpClientInterface;
|
use Symfony\Contracts\HttpClient\HttpClientInterface;
|
||||||
|
|
||||||
|
|
@ -104,126 +116,122 @@ class GenericWebProvider implements InfoProviderInterface
|
||||||
return $host;
|
return $host;
|
||||||
}
|
}
|
||||||
|
|
||||||
private function productJsonLdToPart(array $jsonLd, string $url, Crawler $dom): PartDetailDTO
|
private function productToPart(Product $product, string $url, Crawler $dom): PartDetailDTO
|
||||||
{
|
{
|
||||||
$notes = $jsonLd['description'] ?? "";
|
$notes = $product->description->toString() ?? "";
|
||||||
if (isset($jsonLd['disambiguatingDescription'])) {
|
if ($product->disambiguatingDescription !== null) {
|
||||||
if (!empty($notes)) {
|
if (!empty($notes)) {
|
||||||
$notes .= "\n\n";
|
$notes .= "\n\n";
|
||||||
}
|
}
|
||||||
$notes .= $jsonLd['disambiguatingDescription'];
|
$notes .= $product->disambiguatingDescription->toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//Extract vendor infos
|
||||||
$vendor_infos = null;
|
$vendor_infos = null;
|
||||||
if (isset($jsonLd['offers'])) {
|
$offer = $product->offers->getFirstValue();
|
||||||
|
if ($offer !== null) {
|
||||||
if (array_is_list($jsonLd['offers'])) {
|
|
||||||
$offer = $jsonLd['offers'][0];
|
|
||||||
} else {
|
|
||||||
$offer = $jsonLd['offers'];
|
|
||||||
}
|
|
||||||
|
|
||||||
//Make $jsonLd['url'] absolute if it's relative
|
|
||||||
if (isset($jsonLd['url']) && parse_url($jsonLd['url'], PHP_URL_SCHEME) === null) {
|
|
||||||
$parsedUrl = parse_url($url);
|
|
||||||
$scheme = $parsedUrl['scheme'] ?? 'https';
|
|
||||||
$host = $parsedUrl['host'] ?? '';
|
|
||||||
$jsonLd['url'] = $scheme.'://'.$host.$jsonLd['url'];
|
|
||||||
}
|
|
||||||
|
|
||||||
$prices = [];
|
$prices = [];
|
||||||
if (isset($offer['price'])) {
|
if ($offer->price->toString() !== null) {
|
||||||
$prices[] = new PriceDTO(
|
$prices = [new PriceDTO(
|
||||||
minimum_discount_amount: 1,
|
minimum_discount_amount: 1,
|
||||||
price: (string) $offer['price'],
|
price: $offer->price->toString(),
|
||||||
currency_iso_code: $offer['priceCurrency'] ?? null
|
currency_iso_code: $offer->priceCurrency?->toString()
|
||||||
);
|
)];
|
||||||
} else if (isset($offer['offers']) && array_is_list($offer['offers'])) {
|
} else { //Check for nested offers (like IKEA does it)
|
||||||
//Some sites nest offers
|
$offer2 = $offer->offers->getFirstValue();
|
||||||
foreach ($offer['offers'] as $subOffer) {
|
if ($offer2 !== null && $offer2->price->toString() !== null) {
|
||||||
if (isset($subOffer['price'])) {
|
$prices = [
|
||||||
$prices[] = new PriceDTO(
|
new PriceDTO(
|
||||||
minimum_discount_amount: 1,
|
minimum_discount_amount: 1,
|
||||||
price: (string) $subOffer['price'],
|
price: $offer2->price->toString(),
|
||||||
currency_iso_code: $subOffer['priceCurrency'] ?? null
|
currency_iso_code: $offer2->priceCurrency?->toString()
|
||||||
);
|
)
|
||||||
}
|
];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
$vendor_infos = [new PurchaseInfoDTO(
|
$vendor_infos = [new PurchaseInfoDTO(
|
||||||
distributor_name: $this->extractShopName($url),
|
distributor_name: $this->extractShopName($url),
|
||||||
order_number: (string) ($jsonLd['sku'] ?? $jsonLd['@id'] ?? $jsonLd['gtin'] ?? 'Unknown'),
|
order_number: $product->sku?->toString() ?? $product->identifier?->toString() ?? 'Unknown',
|
||||||
prices: $prices,
|
prices: $prices,
|
||||||
product_url: $jsonLd['url'] ?? $url,
|
product_url: $offer->url?->toString() ?? $url,
|
||||||
)];
|
)];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//Extract image:
|
||||||
$image = null;
|
$image = null;
|
||||||
if (isset($jsonLd['image'])) {
|
if ($product->image !== null) {
|
||||||
if (is_array($jsonLd['image'])) {
|
$imageObj = $product->image->getFirstValue();
|
||||||
if (array_is_list($jsonLd['image'])) {
|
if (is_string($imageObj)) {
|
||||||
$image = $jsonLd['image'][0] ?? null;
|
$image = $imageObj;
|
||||||
}
|
} else if ($imageObj instanceof ImageObject) {
|
||||||
} elseif (is_string($jsonLd['image'])) {
|
$image = $imageObj->contentUrl?->toString() ?? $imageObj->url?->toString();
|
||||||
$image = $jsonLd['image'];
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//If image is an object with @type ImageObject, extract the url
|
|
||||||
if (is_array($image) && isset($image['@type']) && $image['@type'] === 'ImageObject') {
|
|
||||||
$image = $image['contentUrl'] ?? $image['url'] ?? null;
|
|
||||||
}
|
|
||||||
|
|
||||||
//Try to extract parameters from additionalProperty
|
//Extract parameters from additionalProperty
|
||||||
$parameters = [];
|
$parameters = [];
|
||||||
if (isset($jsonLd['additionalProperty']) && array_is_list($jsonLd['additionalProperty'])) {
|
foreach ($product->additionalProperty->getValues() as $property) {
|
||||||
foreach ($jsonLd['additionalProperty'] as $property) { //TODO: Handle minValue and maxValue
|
if ($property instanceof PropertyValue) { //TODO: Handle minValue and maxValue
|
||||||
if (isset ($property['unitText'])) {
|
if ($property->unitText->toString() !== null) {
|
||||||
$parameters[] = ParameterDTO::parseValueField(
|
$parameters[] = ParameterDTO::parseValueField(
|
||||||
name: $property['name'] ?? 'Unknown',
|
name: $property->name->toString() ?? 'Unknown',
|
||||||
value: $property['value'] ?? '',
|
value: $property->value->toString() ?? '',
|
||||||
unit: $property['unitText']
|
unit: $property->unitText->toString()
|
||||||
);
|
);
|
||||||
} else {
|
} else {
|
||||||
$parameters[] = ParameterDTO::parseValueIncludingUnit(
|
$parameters[] = ParameterDTO::parseValueIncludingUnit(
|
||||||
name: $property['name'] ?? 'Unknown',
|
name: $property->name->toString() ?? 'Unknown',
|
||||||
value: $property['value'] ?? ''
|
value: $property->value->toString() ?? ''
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//Try to extract weight
|
||||||
|
$mass = null;
|
||||||
|
if (($weight = $product?->weight->getFirstValue()) instanceof QuantitativeValue) {
|
||||||
|
$mass = $weight->value->toString();
|
||||||
|
}
|
||||||
|
|
||||||
return new PartDetailDTO(
|
return new PartDetailDTO(
|
||||||
provider_key: $this->getProviderKey(),
|
provider_key: $this->getProviderKey(),
|
||||||
provider_id: $url,
|
provider_id: $url,
|
||||||
name: $jsonLd ['name'] ?? 'Unknown Name',
|
name: $product->name?->toString() ?? $product->alternateName?->toString() ?? $product?->mpn->toString() ?? 'Unknown Name',
|
||||||
description: $this->getMetaContent($dom, 'og:description') ?? $this->getMetaContent($dom, 'description') ?? '',
|
description: $this->getMetaContent($dom, 'og:description') ?? $this->getMetaContent($dom, 'description') ?? '',
|
||||||
category: isset($jsonLd['category']) && is_string($jsonLd['category']) ? $jsonLd['category'] : null,
|
category: $product->category?->toString(),
|
||||||
manufacturer: $jsonLd['manufacturer']['name'] ?? $jsonLd['brand']['name'] ?? null,
|
manufacturer: self::propertyOrString($product->manufacturer) ?? self::propertyOrString($product->brand),
|
||||||
mpn: $jsonLd['mpn'] ?? null,
|
mpn: $product->mpn?->toString(),
|
||||||
preview_image_url: $image,
|
preview_image_url: $image,
|
||||||
provider_url: $url,
|
provider_url: $url,
|
||||||
notes: $notes,
|
notes: $notes,
|
||||||
parameters: $parameters,
|
parameters: $parameters,
|
||||||
vendor_infos: $vendor_infos,
|
vendor_infos: $vendor_infos,
|
||||||
mass: isset($jsonLd['weight']['value']) ? (float)$jsonLd['weight']['value'] : null,
|
mass: $mass
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
private static function propertyOrString(SchemaTypeList|Thing|string|null $value, string $property = "name"): ?string
|
||||||
* Decodes JSON in a forgiving way, trying to fix common issues.
|
|
||||||
* @param string $json
|
|
||||||
* @return array
|
|
||||||
* @throws \JsonException
|
|
||||||
*/
|
|
||||||
private function json_decode_forgiving(string $json): array
|
|
||||||
{
|
{
|
||||||
//Sanitize common issues
|
if ($value instanceof SchemaTypeList) {
|
||||||
$json = preg_replace("/[\r\n]+/", " ", $json);
|
$value = $value->getFirstValue();
|
||||||
return json_decode($json, true, 512, JSON_THROW_ON_ERROR);
|
}
|
||||||
|
if ($value === null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (is_string($value)) {
|
||||||
|
return $value;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ($value instanceof Thing) {
|
||||||
|
return $value->$property?->toString();
|
||||||
|
}
|
||||||
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Gets the content of a meta tag by its name or property attribute, or null if not found
|
* Gets the content of a meta tag by its name or property attribute, or null if not found
|
||||||
* @param Crawler $dom
|
* @param Crawler $dom
|
||||||
|
|
@ -336,18 +344,14 @@ class GenericWebProvider implements InfoProviderInterface
|
||||||
$canonicalURL = $scheme.'://'.$host.$canonicalURL;
|
$canonicalURL = $scheme.'://'.$host.$canonicalURL;
|
||||||
}
|
}
|
||||||
|
|
||||||
//Try to find json-ld data in the head
|
|
||||||
$jsonLdNodes = $dom->filter('script[type="application/ld+json"]');
|
$schemaReader = SchemaReader::forAllFormats();
|
||||||
foreach ($jsonLdNodes as $node) {
|
$things = $schemaReader->readHtml($content, $canonicalURL);
|
||||||
$jsonLd = $this->json_decode_forgiving($node->textContent);
|
|
||||||
//If the content of json-ld is an array, try to find a product inside
|
//Try to find a Product schema
|
||||||
if (!array_is_list($jsonLd)) {
|
foreach ($things as $thing) {
|
||||||
$jsonLd = [$jsonLd];
|
if ($thing instanceof Product) {
|
||||||
}
|
return $this->productToPart($thing, $canonicalURL, $dom);
|
||||||
foreach ($jsonLd as $item) {
|
|
||||||
if (isset($item['@type']) && $item['@type'] === 'Product') {
|
|
||||||
return $this->productJsonLdToPart($item, $canonicalURL, $dom);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue