From 14981200c82777f83c0980dca7e145647e89d200 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 1 Feb 2026 14:35:58 +0100 Subject: [PATCH 01/11] Started implementing a generic web provider which uses JSONLD data provided by a webshop page --- .../Providers/GenericWebProvider.php | 177 ++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 src/Services/InfoProviderSystem/Providers/GenericWebProvider.php diff --git a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php new file mode 100644 index 00000000..6044e338 --- /dev/null +++ b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php @@ -0,0 +1,177 @@ +. + */ + +declare(strict_types=1); + + +namespace App\Services\InfoProviderSystem\Providers; + +use App\Services\InfoProviderSystem\DTOs\PartDetailDTO; +use App\Services\InfoProviderSystem\DTOs\PriceDTO; +use App\Services\InfoProviderSystem\DTOs\PurchaseInfoDTO; +use PhpOffice\PhpSpreadsheet\Calculation\Financial\Securities\Price; +use Symfony\Component\DomCrawler\Crawler; +use Symfony\Contracts\HttpClient\HttpClientInterface; + +class GenericWebProvider implements InfoProviderInterface +{ + + public const DISTRIBUTOR_NAME = 'Website'; + + public function __construct(private readonly HttpClientInterface $httpClient) + { + + } + + public function getProviderInfo(): array + { + return [ + 'name' => 'Generic Web URL', + 'description' => 'Tries to extract a part from a given product', + //'url' => 'https://example.com', + 'disabled_help' => 'Enable in settings to use this provider' + ]; + } + + public function getProviderKey(): string + { + return 'generic_web'; + } + + public function isActive(): bool + { + return true; + } + + public function searchByKeyword(string $keyword): array + { + return [ + $this->getDetails($keyword) + ]; + } + + private function extractShopName(string $url): string + { + $host = parse_url($url, PHP_URL_HOST); + if ($host === false || $host === null) { + return self::DISTRIBUTOR_NAME; + } + return $host; + } + + private function productJsonLdToPart(array $jsonLd, string $url): PartDetailDTO + { + $notes = $jsonLd['description'] ?? ""; + if (isset($jsonLd['disambiguatingDescription'])) { + if (!empty($notes)) { + $notes .= "\n\n"; + } + $notes .= $jsonLd['disambiguatingDescription']; + } + + $vendor_infos = null; + if (isset($jsonLd['offers'])) { + $vendor_infos = [new PurchaseInfoDTO( + distributor_name: $this->extractShopName($url), + order_number: $jsonLd['sku'] ?? $jsonLd['@id'] ?? $jsonLd['gtin'] ?? 'Unknown', + prices: [new PriceDTO(minimum_discount_amount: 1, price: (string) $jsonLd['offers']['price'], currency_iso_code: $jsonLd['offers']['priceCurrency'] ?? null)], + product_url: $jsonLd['url'] ?? $url, + )]; + } + + $image = null; + if (isset($jsonLd['image'])) { + if (is_array($jsonLd['image'])) { + $image = $jsonLd['image'][0] ?? null; + } elseif (is_string($jsonLd['image'])) { + $image = $jsonLd['image']; + } + } + + return new PartDetailDTO( + provider_key: $this->getProviderKey(), + provider_id: $url, + name: $jsonLd ['name'] ?? 'Unknown Name', + description: '', + category: isset($jsonLd['category']) && is_string($jsonLd['category']) ? $jsonLd['category'] : null, + manufacturer: $jsonLd['manufacturer']['name'] ?? $jsonLd['brand']['name'] ?? null, + mpn: $jsonLd['mpn'] ?? null, + preview_image_url: $image, + provider_url: $url, + notes: $notes, + vendor_infos: $vendor_infos, + mass: isset($jsonLd['weight']['value']) ? (float)$jsonLd['weight']['value'] : null, + ); + } + + /** + * Decodes JSON in a forgiving way, trying to fix common issues. + * @param string $json + * @return array + * @throws \JsonException + */ + private function json_decode_forgiving(string $json): array + { + //Sanitize common issues + $json = preg_replace("/[\r\n]+/", " ", $json); + return json_decode($json, true, 512, JSON_THROW_ON_ERROR); + } + + public function getDetails(string $id): PartDetailDTO + { + $url = $id; + + //Try to get the webpage content + $response = $this->httpClient->request('GET', $url); + $content = $response->getContent(); + + $dom = new Crawler($content); + + //Try to determine a canonical URL + $canonicalURL = $url; + if ($dom->filter('link[rel="canonical"]')->count() > 0) { + $canonicalURL = $dom->filter('link[rel="canonical"]')->attr('href'); + } else if ($dom->filter('meta[property="og:url"]')->count() > 0) { + $canonicalURL = $dom->filter('meta[property="og:url"]')->attr('content'); + } + + //Try to find json-ld data in the head + $jsonLdNodes = $dom->filter('head script[type="application/ld+json"]'); + foreach ($jsonLdNodes as $node) { + $jsonLd = $this->json_decode_forgiving($node->textContent); + if (isset($jsonLd['@type']) && $jsonLd['@type'] === 'Product') { //If we find a product use that data + return $this->productJsonLdToPart($jsonLd, $canonicalURL); + } + } + + + + return null; + } + + public function getCapabilities(): array + { + return [ + ProviderCapabilities::BASIC, + ProviderCapabilities::PICTURE, + ProviderCapabilities::PRICE + ]; + } +} From b89e878871eaa81ab0293995b9f1b32e7e4029b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 1 Feb 2026 16:39:19 +0100 Subject: [PATCH 02/11] Allow to rudimentary parse product pages, even if they do not contain JSON-LD data --- .../Providers/GenericWebProvider.php | 67 ++++++++++++++++--- 1 file changed, 59 insertions(+), 8 deletions(-) diff --git a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php index 6044e338..2704ad99 100644 --- a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php +++ b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php @@ -35,9 +35,18 @@ class GenericWebProvider implements InfoProviderInterface public const DISTRIBUTOR_NAME = 'Website'; - public function __construct(private readonly HttpClientInterface $httpClient) - { + private readonly HttpClientInterface $httpClient; + public function __construct(HttpClientInterface $httpClient) + { + $this->httpClient = $httpClient->withOptions( + [ + 'headers' => [ + 'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/144.0.0.0 Safari/537.36', + ], + 'timeout' => 15, + ] + ); } public function getProviderInfo(): array @@ -76,7 +85,7 @@ class GenericWebProvider implements InfoProviderInterface return $host; } - private function productJsonLdToPart(array $jsonLd, string $url): PartDetailDTO + private function productJsonLdToPart(array $jsonLd, string $url, Crawler $dom): PartDetailDTO { $notes = $jsonLd['description'] ?? ""; if (isset($jsonLd['disambiguatingDescription'])) { @@ -109,7 +118,7 @@ class GenericWebProvider implements InfoProviderInterface provider_key: $this->getProviderKey(), provider_id: $url, name: $jsonLd ['name'] ?? 'Unknown Name', - description: '', + description: $this->getMetaContent($dom, 'og:description') ?? $this->getMetaContent($dom, 'description') ?? '', category: isset($jsonLd['category']) && is_string($jsonLd['category']) ? $jsonLd['category'] : null, manufacturer: $jsonLd['manufacturer']['name'] ?? $jsonLd['brand']['name'] ?? null, mpn: $jsonLd['mpn'] ?? null, @@ -134,6 +143,22 @@ class GenericWebProvider implements InfoProviderInterface return json_decode($json, true, 512, JSON_THROW_ON_ERROR); } + private function getMetaContent(Crawler $dom, string $name): ?string + { + $meta = $dom->filter('meta[property="'.$name.'"]'); + if ($meta->count() > 0) { + return $meta->attr('content'); + } + + //Try name attribute + $meta = $dom->filter('meta[name="'.$name.'"]'); + if ($meta->count() > 0) { + return $meta->attr('content'); + } + + return null; + } + public function getDetails(string $id): PartDetailDTO { $url = $id; @@ -153,17 +178,43 @@ class GenericWebProvider implements InfoProviderInterface } //Try to find json-ld data in the head - $jsonLdNodes = $dom->filter('head script[type="application/ld+json"]'); + $jsonLdNodes = $dom->filter('script[type="application/ld+json"]'); foreach ($jsonLdNodes as $node) { $jsonLd = $this->json_decode_forgiving($node->textContent); if (isset($jsonLd['@type']) && $jsonLd['@type'] === 'Product') { //If we find a product use that data - return $this->productJsonLdToPart($jsonLd, $canonicalURL); + return $this->productJsonLdToPart($jsonLd, $canonicalURL, $dom); } } - + //If no JSON-LD data is found, try to extract basic data from meta tags + $pageTitle = $dom->filter('title')->count() > 0 ? $dom->filter('title')->text() : 'Unknown'; - return null; + $prices = []; + if ($price = $this->getMetaContent($dom, 'product:price:amount')) { + $prices[] = new PriceDTO( + minimum_discount_amount: 1, + price: $price, + currency_iso_code: $this->getMetaContent($dom, 'product:price:currency'), + ); + } + + $vendor_infos = [new PurchaseInfoDTO( + distributor_name: $this->extractShopName($canonicalURL), + order_number: 'Unknown', + prices: $prices, + product_url: $canonicalURL, + )]; + + return new PartDetailDTO( + provider_key: $this->getProviderKey(), + provider_id: $canonicalURL, + name: $this->getMetaContent($dom, 'og:title') ?? $pageTitle, + description: $this->getMetaContent($dom, 'og:description') ?? $this->getMetaContent($dom, 'description') ?? '', + manufacturer: $this->getMetaContent($dom, 'product:brand'), + preview_image_url: $this->getMetaContent($dom, 'og:image'), + provider_url: $canonicalURL, + vendor_infos: $vendor_infos, + ); } public function getCapabilities(): array From 73dbe64a83ad8ae1562bb3d1b137bfabc302ecde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 1 Feb 2026 16:51:26 +0100 Subject: [PATCH 03/11] Allow to extract prices form an Amazon page --- .../Providers/GenericWebProvider.php | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php index 2704ad99..09d135b8 100644 --- a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php +++ b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php @@ -196,6 +196,16 @@ class GenericWebProvider implements InfoProviderInterface price: $price, currency_iso_code: $this->getMetaContent($dom, 'product:price:currency'), ); + } else { + //Amazon fallback + $amazonAmount = $dom->filter('input[type="hidden"][name*="amount"]'); + if ($amazonAmount->count() > 0) { + $prices[] = new PriceDTO( + minimum_discount_amount: 1, + price: $amazonAmount->first()->attr('value'), + currency_iso_code: $dom->filter('input[type="hidden"][name*="currencyCode"]')->first()->attr('value'), + ); + } } $vendor_infos = [new PurchaseInfoDTO( From 52be5481702516f124dce5e75923ad172737e501 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 1 Feb 2026 16:55:52 +0100 Subject: [PATCH 04/11] Add https:// if not existing --- .../InfoProviderSystem/Providers/GenericWebProvider.php | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php index 09d135b8..17457c75 100644 --- a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php +++ b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php @@ -161,6 +161,14 @@ class GenericWebProvider implements InfoProviderInterface public function getDetails(string $id): PartDetailDTO { + //Add scheme if missing + if (!preg_match('/^https?:\/\//', $id)) { + //Remove any leading slashes + $id = ltrim($id, '/'); + + $id = 'https://'.$id; + } + $url = $id; //Try to get the webpage content From d868225260316aa303749f1cd72173fe58f2c32c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 1 Feb 2026 17:06:38 +0100 Subject: [PATCH 05/11] Properly parse JSONLD product data if it is in an array with others --- .../Providers/GenericWebProvider.php | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php index 17457c75..8b976b50 100644 --- a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php +++ b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php @@ -97,10 +97,17 @@ class GenericWebProvider implements InfoProviderInterface $vendor_infos = null; if (isset($jsonLd['offers'])) { + + if (array_is_list($jsonLd['offers'])) { + $offer = $jsonLd['offers'][0]; + } else { + $offer = $jsonLd['offers']; + } + $vendor_infos = [new PurchaseInfoDTO( distributor_name: $this->extractShopName($url), - order_number: $jsonLd['sku'] ?? $jsonLd['@id'] ?? $jsonLd['gtin'] ?? 'Unknown', - prices: [new PriceDTO(minimum_discount_amount: 1, price: (string) $jsonLd['offers']['price'], currency_iso_code: $jsonLd['offers']['priceCurrency'] ?? null)], + order_number: (string) ($jsonLd['sku'] ?? $jsonLd['@id'] ?? $jsonLd['gtin'] ?? 'Unknown'), + prices: [new PriceDTO(minimum_discount_amount: 1, price: (string) $offer['price'], currency_iso_code: $offer['priceCurrency'] ?? null)], product_url: $jsonLd['url'] ?? $url, )]; } @@ -189,8 +196,14 @@ class GenericWebProvider implements InfoProviderInterface $jsonLdNodes = $dom->filter('script[type="application/ld+json"]'); foreach ($jsonLdNodes as $node) { $jsonLd = $this->json_decode_forgiving($node->textContent); - if (isset($jsonLd['@type']) && $jsonLd['@type'] === 'Product') { //If we find a product use that data - return $this->productJsonLdToPart($jsonLd, $canonicalURL, $dom); + //If the content of json-ld is an array, try to find a product inside + if (!array_is_list($jsonLd)) { + $jsonLd = [$jsonLd]; + } + foreach ($jsonLd as $item) { + if (isset($item['@type']) && $item['@type'] === 'Product') { + return $this->productJsonLdToPart($item, $canonicalURL, $dom); + } } } From 1213f82cdf73850d783f2ad56611341293ad8756 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 1 Feb 2026 17:11:41 +0100 Subject: [PATCH 06/11] Fix if canonical URL is relative --- .../Providers/GenericWebProvider.php | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php index 8b976b50..3c657989 100644 --- a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php +++ b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php @@ -104,6 +104,14 @@ class GenericWebProvider implements InfoProviderInterface $offer = $jsonLd['offers']; } + //Make $jsonLd['url'] absolute if it's relative + if (isset($jsonLd['url']) && parse_url($jsonLd['url'], PHP_URL_SCHEME) === null) { + $parsedUrl = parse_url($url); + $scheme = $parsedUrl['scheme'] ?? 'https'; + $host = $parsedUrl['host'] ?? ''; + $jsonLd['url'] = $scheme.'://'.$host.$jsonLd['url']; + } + $vendor_infos = [new PurchaseInfoDTO( distributor_name: $this->extractShopName($url), order_number: (string) ($jsonLd['sku'] ?? $jsonLd['@id'] ?? $jsonLd['gtin'] ?? 'Unknown'), @@ -192,6 +200,14 @@ class GenericWebProvider implements InfoProviderInterface $canonicalURL = $dom->filter('meta[property="og:url"]')->attr('content'); } + //If the canonical URL is relative, make it absolute + if (parse_url($canonicalURL, PHP_URL_SCHEME) === null) { + $parsedUrl = parse_url($url); + $scheme = $parsedUrl['scheme'] ?? 'https'; + $host = $parsedUrl['host'] ?? ''; + $canonicalURL = $scheme.'://'.$host.$canonicalURL; + } + //Try to find json-ld data in the head $jsonLdNodes = $dom->filter('script[type="application/ld+json"]'); foreach ($jsonLdNodes as $node) { From 7feba634b8f8195e583b851c669b19a35aa011e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 1 Feb 2026 17:20:13 +0100 Subject: [PATCH 07/11] Hadle if offers are nested and images are ImageObjects in JSON+LD --- .../Providers/GenericWebProvider.php | 30 +++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php index 3c657989..a098c685 100644 --- a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php +++ b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php @@ -112,10 +112,30 @@ class GenericWebProvider implements InfoProviderInterface $jsonLd['url'] = $scheme.'://'.$host.$jsonLd['url']; } + $prices = []; + if (isset($offer['price'])) { + $prices[] = new PriceDTO( + minimum_discount_amount: 1, + price: (string) $offer['price'], + currency_iso_code: $offer['priceCurrency'] ?? null + ); + } else if (isset($offer['offers']) && array_is_list($offer['offers'])) { + //Some sites nest offers + foreach ($offer['offers'] as $subOffer) { + if (isset($subOffer['price'])) { + $prices[] = new PriceDTO( + minimum_discount_amount: 1, + price: (string) $subOffer['price'], + currency_iso_code: $subOffer['priceCurrency'] ?? null + ); + } + } + } + $vendor_infos = [new PurchaseInfoDTO( distributor_name: $this->extractShopName($url), order_number: (string) ($jsonLd['sku'] ?? $jsonLd['@id'] ?? $jsonLd['gtin'] ?? 'Unknown'), - prices: [new PriceDTO(minimum_discount_amount: 1, price: (string) $offer['price'], currency_iso_code: $offer['priceCurrency'] ?? null)], + prices: $prices, product_url: $jsonLd['url'] ?? $url, )]; } @@ -123,11 +143,17 @@ class GenericWebProvider implements InfoProviderInterface $image = null; if (isset($jsonLd['image'])) { if (is_array($jsonLd['image'])) { - $image = $jsonLd['image'][0] ?? null; + if (array_is_list($jsonLd['image'])) { + $image = $jsonLd['image'][0] ?? null; + } } elseif (is_string($jsonLd['image'])) { $image = $jsonLd['image']; } } + //If image is an object with @type ImageObject, extract the url + if (is_array($image) && isset($image['@type']) && $image['@type'] === 'ImageObject') { + $image = $image['contentUrl'] ?? $image['url'] ?? null; + } return new PartDetailDTO( provider_key: $this->getProviderKey(), From 071f6f85913ab4a5c496a06e46939ab603c0a77f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 1 Feb 2026 17:34:08 +0100 Subject: [PATCH 08/11] Return an empty array if no URL is provider to the Generic Web URL provider --- .../ProviderIDNotSupportedException.php | 32 +++++++++++++++++++ .../Providers/GenericWebProvider.php | 13 +++++++- 2 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 src/Exceptions/ProviderIDNotSupportedException.php diff --git a/src/Exceptions/ProviderIDNotSupportedException.php b/src/Exceptions/ProviderIDNotSupportedException.php new file mode 100644 index 00000000..429f43ea --- /dev/null +++ b/src/Exceptions/ProviderIDNotSupportedException.php @@ -0,0 +1,32 @@ +. + */ + +declare(strict_types=1); + + +namespace App\Exceptions; + +class ProviderIDNotSupportedException extends \RuntimeException +{ + public function fromProvider(string $providerKey, string $id): self + { + return new self(sprintf('The given ID %s is not supported by the provider %s.', $id, $providerKey,)); + } +} diff --git a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php index a098c685..248b4f0e 100644 --- a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php +++ b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php @@ -23,6 +23,7 @@ declare(strict_types=1); namespace App\Services\InfoProviderSystem\Providers; +use App\Exceptions\ProviderIDNotSupportedException; use App\Services\InfoProviderSystem\DTOs\PartDetailDTO; use App\Services\InfoProviderSystem\DTOs\PriceDTO; use App\Services\InfoProviderSystem\DTOs\PurchaseInfoDTO; @@ -71,9 +72,12 @@ class GenericWebProvider implements InfoProviderInterface public function searchByKeyword(string $keyword): array { + try { return [ $this->getDetails($keyword) - ]; + ]; } catch (ProviderIDNotSupportedException $e) { + return []; + } } private function extractShopName(string $url): string @@ -212,6 +216,13 @@ class GenericWebProvider implements InfoProviderInterface $url = $id; + //If this is not a valid URL with host, domain and path, throw an exception + if (filter_var($url, FILTER_VALIDATE_URL) === false || + parse_url($url, PHP_URL_HOST) === null || + parse_url($url, PHP_URL_PATH) === null) { + throw new ProviderIDNotSupportedException("The given ID is not a valid URL: ".$id); + } + //Try to get the webpage content $response = $this->httpClient->request('GET', $url); $content = $response->getContent(); From 722eb7ddab0e8a00d36845b68eaff82cde2f1f20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 1 Feb 2026 17:47:04 +0100 Subject: [PATCH 09/11] Added settings and docs for the generic Web info provider --- docs/usage/information_provider_system.md | 15 +++++++ .../Providers/GenericWebProvider.php | 10 +++-- .../GenericWebProviderSettings.php | 43 +++++++++++++++++++ .../InfoProviderSettings.php | 3 ++ .../settings/provider_settings.html.twig | 2 +- translations/messages.en.xlf | 18 ++++++++ 6 files changed, 86 insertions(+), 5 deletions(-) create mode 100644 src/Settings/InfoProviderSystem/GenericWebProviderSettings.php diff --git a/docs/usage/information_provider_system.md b/docs/usage/information_provider_system.md index da8ea32b..6cdb5183 100644 --- a/docs/usage/information_provider_system.md +++ b/docs/usage/information_provider_system.md @@ -96,6 +96,21 @@ The following providers are currently available and shipped with Part-DB: (All trademarks are property of their respective owners. Part-DB is not affiliated with any of the companies.) +### Generic Web URL Provider +The Generic Web URL Provider can extract part information from any webpage that contains structured data in the form of +[Schema.org](https://schema.org/) format. Many e-commerce websites use this format to provide detailed product information +for search engines and other services. Therefore it allows Part-DB to retrieve rudimentary part information (like name, image and price) +from a wide range of websites without the need for a dedicated API integration. +To use the Generic Web URL Provider, simply enable it in the information provider settings. No additional configuration +is required. Afterwards you can enter any product URL in the search field, and Part-DB will attempt to extract the relevant part information +from the webpage. + +Please note that if this provider is enabled, Part-DB will make HTTP requests to external websites to fetch product data, which +may have privacy and security implications. + +Following env configuration options are available: +* `PROVIDER_GENERIC_WEB_ENABLED`: Set this to `1` to enable the Generic Web URL Provider (optional, default: `0`) + ### Octopart The Octopart provider uses the [Octopart / Nexar API](https://nexar.com/api) to search for parts and get information. diff --git a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php index 248b4f0e..d05aac8f 100644 --- a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php +++ b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php @@ -27,6 +27,7 @@ use App\Exceptions\ProviderIDNotSupportedException; use App\Services\InfoProviderSystem\DTOs\PartDetailDTO; use App\Services\InfoProviderSystem\DTOs\PriceDTO; use App\Services\InfoProviderSystem\DTOs\PurchaseInfoDTO; +use App\Settings\InfoProviderSystem\GenericWebProviderSettings; use PhpOffice\PhpSpreadsheet\Calculation\Financial\Securities\Price; use Symfony\Component\DomCrawler\Crawler; use Symfony\Contracts\HttpClient\HttpClientInterface; @@ -38,7 +39,7 @@ class GenericWebProvider implements InfoProviderInterface private readonly HttpClientInterface $httpClient; - public function __construct(HttpClientInterface $httpClient) + public function __construct(HttpClientInterface $httpClient, private readonly GenericWebProviderSettings $settings) { $this->httpClient = $httpClient->withOptions( [ @@ -54,9 +55,10 @@ class GenericWebProvider implements InfoProviderInterface { return [ 'name' => 'Generic Web URL', - 'description' => 'Tries to extract a part from a given product', + 'description' => 'Tries to extract a part from a given product webpage URL using common metadata standards like JSON-LD and OpenGraph.', //'url' => 'https://example.com', - 'disabled_help' => 'Enable in settings to use this provider' + 'disabled_help' => 'Enable in settings to use this provider', + 'settings_class' => GenericWebProviderSettings::class, ]; } @@ -67,7 +69,7 @@ class GenericWebProvider implements InfoProviderInterface public function isActive(): bool { - return true; + return $this->settings->enabled; } public function searchByKeyword(string $keyword): array diff --git a/src/Settings/InfoProviderSystem/GenericWebProviderSettings.php b/src/Settings/InfoProviderSystem/GenericWebProviderSettings.php new file mode 100644 index 00000000..064d8a1c --- /dev/null +++ b/src/Settings/InfoProviderSystem/GenericWebProviderSettings.php @@ -0,0 +1,43 @@ +. + */ + +declare(strict_types=1); + + +namespace App\Settings\InfoProviderSystem; + +use App\Settings\SettingsIcon; +use Jbtronics\SettingsBundle\Metadata\EnvVarMode; +use Jbtronics\SettingsBundle\Settings\Settings; +use Jbtronics\SettingsBundle\Settings\SettingsParameter; +use Jbtronics\SettingsBundle\Settings\SettingsTrait; +use Symfony\Component\Translation\TranslatableMessage as TM; + +#[Settings(label: new TM("settings.ips.generic_web_provider"), description: new TM("settings.ips.generic_web_provider.description"))] +#[SettingsIcon("fa-plug")] +class GenericWebProviderSettings +{ + use SettingsTrait; + + #[SettingsParameter(label: new TM("settings.ips.lcsc.enabled"), description: new TM("settings.ips.generic_web_provider.enabled.help"), + envVar: "bool:PROVIDER_GENERIC_WEB_ENABLED", envVarMode: EnvVarMode::OVERWRITE + )] + public bool $enabled = false; +} diff --git a/src/Settings/InfoProviderSystem/InfoProviderSettings.php b/src/Settings/InfoProviderSystem/InfoProviderSettings.php index fb31bdb9..3e78233f 100644 --- a/src/Settings/InfoProviderSystem/InfoProviderSettings.php +++ b/src/Settings/InfoProviderSystem/InfoProviderSettings.php @@ -37,6 +37,9 @@ class InfoProviderSettings #[EmbeddedSettings] public ?InfoProviderGeneralSettings $general = null; + #[EmbeddedSettings] + public ?GenericWebProviderSettings $genericWebProvider = null; + #[EmbeddedSettings] public ?DigikeySettings $digikey = null; diff --git a/templates/info_providers/settings/provider_settings.html.twig b/templates/info_providers/settings/provider_settings.html.twig index 1876c2eb..86e5bc9b 100644 --- a/templates/info_providers/settings/provider_settings.html.twig +++ b/templates/info_providers/settings/provider_settings.html.twig @@ -10,7 +10,7 @@ {% block card_content %}

- {% if info_provider_info.url %} + {% if info_provider_info.url is defined %} {{ info_provider_info.name }} {% else %} {{ info_provider_info.name }} diff --git a/translations/messages.en.xlf b/translations/messages.en.xlf index b2bd908e..706d629a 100644 --- a/translations/messages.en.xlf +++ b/translations/messages.en.xlf @@ -14316,5 +14316,23 @@ Buerklin-API Authentication server: Only includes attachments in the selected languages in the results. + + + settings.ips.generic_web_provider + Generic Web URL Provider + + + + + settings.ips.generic_web_provider.description + This info provider allows to retrieve basic part information from many shop page URLs. + + + + + settings.ips.generic_web_provider.enabled.help + When the provider is enabled, users can make requests to arbitary websites on behalf of the Part-DB server. Only enable this, if you are aware of the potential consequences. + + From 909cab004431be9c89a76c888bb23614b70f9507 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 1 Feb 2026 18:18:58 +0100 Subject: [PATCH 10/11] Added an web page to quickly add a new part from a web URL --- src/Controller/InfoProviderController.php | 56 +++++++++++++++++++ src/Services/Trees/ToolsTreeBuilder.php | 10 ++++ .../GenericWebProviderSettings.php | 2 +- templates/_navbar.html.twig | 16 ++++-- .../from_url/from_url.html.twig | 21 +++++++ translations/messages.en.xlf | 24 ++++++++ 6 files changed, 124 insertions(+), 5 deletions(-) create mode 100644 templates/info_providers/from_url/from_url.html.twig diff --git a/src/Controller/InfoProviderController.php b/src/Controller/InfoProviderController.php index e5a5d87b..deec8a57 100644 --- a/src/Controller/InfoProviderController.php +++ b/src/Controller/InfoProviderController.php @@ -30,6 +30,7 @@ use App\Form\InfoProviderSystem\PartSearchType; use App\Services\InfoProviderSystem\ExistingPartFinder; use App\Services\InfoProviderSystem\PartInfoRetriever; use App\Services\InfoProviderSystem\ProviderRegistry; +use App\Services\InfoProviderSystem\Providers\GenericWebProvider; use App\Settings\AppSettings; use App\Settings\InfoProviderSystem\InfoProviderGeneralSettings; use Doctrine\ORM\EntityManagerInterface; @@ -39,6 +40,7 @@ use Psr\Log\LoggerInterface; use Symfony\Bridge\Doctrine\Attribute\MapEntity; use Symfony\Bundle\FrameworkBundle\Controller\AbstractController; use Symfony\Component\Form\Extension\Core\Type\SubmitType; +use Symfony\Component\Form\Extension\Core\Type\UrlType; use Symfony\Component\HttpClient\Exception\ClientException; use Symfony\Component\HttpClient\Exception\TransportException; use Symfony\Component\HttpFoundation\Request; @@ -208,4 +210,58 @@ class InfoProviderController extends AbstractController 'update_target' => $update_target ]); } + + #[Route('/from_url', name: 'info_providers_from_url')] + public function fromURL(Request $request, GenericWebProvider $provider): Response + { + $this->denyAccessUnlessGranted('@info_providers.create_parts'); + + if (!$provider->isActive()) { + $this->addFlash('error', "Generic Web Provider is not active. Please enable it in the provider settings."); + return $this->redirectToRoute('info_providers_list'); + } + + $formBuilder = $this->createFormBuilder(); + $formBuilder->add('url', UrlType::class, [ + 'label' => 'info_providers.from_url.url.label', + 'required' => true, + ]); + $formBuilder->add('submit', SubmitType::class, [ + 'label' => 'info_providers.search.submit', + ]); + + $form = $formBuilder->getForm(); + $form->handleRequest($request); + + $partDetail = null; + if ($form->isSubmitted() && $form->isValid()) { + //Try to retrieve the part detail from the given URL + $url = $form->get('url')->getData(); + try { + $searchResult = $this->infoRetriever->searchByKeyword( + keyword: $url, + providers: [$provider] + ); + + if (count($searchResult) === 0) { + $this->addFlash('warning', t('info_providers.from_url.no_part_found')); + } else { + $searchResult = $searchResult[0]; + //Redirect to the part creation page with the found part detail + return $this->redirectToRoute('info_providers_create_part', [ + 'providerKey' => $searchResult->provider_key, + 'providerId' => $searchResult->provider_id, + ]); + } + } catch (ExceptionInterface $e) { + $this->addFlash('error', t('info_providers.search.error.general_exception', ['%type%' => (new \ReflectionClass($e))->getShortName()])); + } + } + + return $this->render('info_providers/from_url/from_url.html.twig', [ + 'form' => $form, + 'partDetail' => $partDetail, + ]); + + } } diff --git a/src/Services/Trees/ToolsTreeBuilder.php b/src/Services/Trees/ToolsTreeBuilder.php index 37a09b09..c8afac12 100644 --- a/src/Services/Trees/ToolsTreeBuilder.php +++ b/src/Services/Trees/ToolsTreeBuilder.php @@ -39,6 +39,8 @@ use App\Entity\UserSystem\User; use App\Helpers\Trees\TreeViewNode; use App\Services\Cache\UserCacheKeyGenerator; use App\Services\ElementTypeNameGenerator; +use App\Services\InfoProviderSystem\Providers\GenericWebProvider; +use App\Settings\InfoProviderSystem\GenericWebProviderSettings; use Symfony\Bundle\SecurityBundle\Security; use Symfony\Component\Routing\Generator\UrlGeneratorInterface; use Symfony\Contracts\Cache\ItemInterface; @@ -58,6 +60,7 @@ class ToolsTreeBuilder protected UserCacheKeyGenerator $keyGenerator, protected Security $security, private readonly ElementTypeNameGenerator $elementTypeNameGenerator, + private readonly GenericWebProviderSettings $genericWebProviderSettings ) { } @@ -147,6 +150,13 @@ class ToolsTreeBuilder $this->urlGenerator->generate('info_providers_search') ))->setIcon('fa-treeview fa-fw fa-solid fa-cloud-arrow-down'); + if ($this->genericWebProviderSettings->enabled) { + $nodes[] = (new TreeViewNode( + $this->translator->trans('info_providers.from_url.title'), + $this->urlGenerator->generate('info_providers_from_url') + ))->setIcon('fa-treeview fa-fw fa-solid fa-book-atlas'); + } + $nodes[] = (new TreeViewNode( $this->translator->trans('info_providers.bulk_import.manage_jobs'), $this->urlGenerator->generate('bulk_info_provider_manage') diff --git a/src/Settings/InfoProviderSystem/GenericWebProviderSettings.php b/src/Settings/InfoProviderSystem/GenericWebProviderSettings.php index 064d8a1c..07972141 100644 --- a/src/Settings/InfoProviderSystem/GenericWebProviderSettings.php +++ b/src/Settings/InfoProviderSystem/GenericWebProviderSettings.php @@ -30,7 +30,7 @@ use Jbtronics\SettingsBundle\Settings\SettingsParameter; use Jbtronics\SettingsBundle\Settings\SettingsTrait; use Symfony\Component\Translation\TranslatableMessage as TM; -#[Settings(label: new TM("settings.ips.generic_web_provider"), description: new TM("settings.ips.generic_web_provider.description"))] +#[Settings(name: "generic_web_provider", label: new TM("settings.ips.generic_web_provider"), description: new TM("settings.ips.generic_web_provider.description"))] #[SettingsIcon("fa-plug")] class GenericWebProviderSettings { diff --git a/templates/_navbar.html.twig b/templates/_navbar.html.twig index 446ccdab..c4dfbe0f 100644 --- a/templates/_navbar.html.twig +++ b/templates/_navbar.html.twig @@ -10,9 +10,9 @@ - {% if is_granted("@tools.label_scanner") %} + {% if is_granted("@tools.label_scanner") %} - + {% endif %}

@@ -52,6 +52,14 @@ {% trans %}info_providers.search.title{% endtrans %} + {% if settings_instance('generic_web_provider').enabled %} +
  • + + + {% trans %}info_providers.from_url.title{% endtrans %} + +
  • + {% endif %} {% endif %} {% if is_granted('@parts.import') %} @@ -69,7 +77,7 @@ {% if is_granted('@parts.read') %} {{ search.search_form("navbar") }} - {# {% include "_navbar_search.html.twig" %} #} + {# {% include "_navbar_search.html.twig" %} #} {% endif %} @@ -145,4 +153,4 @@ - \ No newline at end of file + diff --git a/templates/info_providers/from_url/from_url.html.twig b/templates/info_providers/from_url/from_url.html.twig new file mode 100644 index 00000000..5aad1a03 --- /dev/null +++ b/templates/info_providers/from_url/from_url.html.twig @@ -0,0 +1,21 @@ +{% extends "main_card.html.twig" %} + +{% import "info_providers/providers.macro.html.twig" as providers_macro %} +{% import "helper.twig" as helper %} + +{% block title %} + {% trans %}info_providers.from_url.title{% endtrans %} +{% endblock %} + +{% block card_title %} + {% trans %}info_providers.from_url.title{% endtrans %} +{% endblock %} + +{% block card_content %} +

    {% trans %}info_providers.from_url.help{% endtrans %}

    + + {{ form_start(form) }} + {{ form_row(form.url) }} + {{ form_row(form.submit) }} + {{ form_end(form) }} +{% endblock %} diff --git a/translations/messages.en.xlf b/translations/messages.en.xlf index 706d629a..87f6c2f6 100644 --- a/translations/messages.en.xlf +++ b/translations/messages.en.xlf @@ -14334,5 +14334,29 @@ Buerklin-API Authentication server: When the provider is enabled, users can make requests to arbitary websites on behalf of the Part-DB server. Only enable this, if you are aware of the potential consequences. + + + info_providers.from_url.title + Create [part] from URL + + + + + info_providers.from_url.url.label + URL + + + + + info_providers.from_url.no_part_found + No part found from the given URL. Are you sure this is a valid shop URL? + + + + + info_providers.from_url.help + Creates a part based on the given URL. It tries to delegate it to an existing info provider if possible, otherwise it will be tried to extract rudimentary data from the webpage's metadata. + + From 47c7ee9f07131ecd2b6be11eca8f6e42abc974f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 1 Feb 2026 18:24:46 +0100 Subject: [PATCH 11/11] Allow to extract parameters form additionalProperty JSONLD data --- .../Providers/GenericWebProvider.php | 32 ++++++++++++++++--- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php index d05aac8f..4b73ad6e 100644 --- a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php +++ b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php @@ -24,6 +24,7 @@ declare(strict_types=1); namespace App\Services\InfoProviderSystem\Providers; use App\Exceptions\ProviderIDNotSupportedException; +use App\Services\InfoProviderSystem\DTOs\ParameterDTO; use App\Services\InfoProviderSystem\DTOs\PartDetailDTO; use App\Services\InfoProviderSystem\DTOs\PriceDTO; use App\Services\InfoProviderSystem\DTOs\PurchaseInfoDTO; @@ -75,9 +76,9 @@ class GenericWebProvider implements InfoProviderInterface public function searchByKeyword(string $keyword): array { try { - return [ - $this->getDetails($keyword) - ]; } catch (ProviderIDNotSupportedException $e) { + return [ + $this->getDetails($keyword) + ]; } catch (ProviderIDNotSupportedException $e) { return []; } } @@ -143,7 +144,7 @@ class GenericWebProvider implements InfoProviderInterface order_number: (string) ($jsonLd['sku'] ?? $jsonLd['@id'] ?? $jsonLd['gtin'] ?? 'Unknown'), prices: $prices, product_url: $jsonLd['url'] ?? $url, - )]; + )]; } $image = null; @@ -161,6 +162,26 @@ class GenericWebProvider implements InfoProviderInterface $image = $image['contentUrl'] ?? $image['url'] ?? null; } + //Try to extract parameters from additionalProperty + $parameters = []; + if (isset($jsonLd['additionalProperty']) && array_is_list($jsonLd['additionalProperty'])) { + foreach ($jsonLd['additionalProperty'] as $property) { //TODO: Handle minValue and maxValue + if (isset ($property['unitText'])) { + $parameters[] = ParameterDTO::parseValueField( + name: $property['name'] ?? 'Unknown', + value: $property['value'] ?? '', + unit: $property['unitText'] + ); + } else { + $parameters[] = ParameterDTO::parseValueIncludingUnit( + name: $property['name'] ?? 'Unknown', + value: $property['value'] ?? '' + ); + } + } + } + + return new PartDetailDTO( provider_key: $this->getProviderKey(), provider_id: $url, @@ -169,9 +190,10 @@ class GenericWebProvider implements InfoProviderInterface category: isset($jsonLd['category']) && is_string($jsonLd['category']) ? $jsonLd['category'] : null, manufacturer: $jsonLd['manufacturer']['name'] ?? $jsonLd['brand']['name'] ?? null, mpn: $jsonLd['mpn'] ?? null, - preview_image_url: $image, + preview_image_url: $image, provider_url: $url, notes: $notes, + parameters: $parameters, vendor_infos: $vendor_infos, mass: isset($jsonLd['weight']['value']) ? (float)$jsonLd['weight']['value'] : null, );