From 14981200c82777f83c0980dca7e145647e89d200 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 1 Feb 2026 14:35:58 +0100 Subject: [PATCH 01/15] Started implementing a generic web provider which uses JSONLD data provided by a webshop page --- .../Providers/GenericWebProvider.php | 177 ++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 src/Services/InfoProviderSystem/Providers/GenericWebProvider.php diff --git a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php new file mode 100644 index 00000000..6044e338 --- /dev/null +++ b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php @@ -0,0 +1,177 @@ +. + */ + +declare(strict_types=1); + + +namespace App\Services\InfoProviderSystem\Providers; + +use App\Services\InfoProviderSystem\DTOs\PartDetailDTO; +use App\Services\InfoProviderSystem\DTOs\PriceDTO; +use App\Services\InfoProviderSystem\DTOs\PurchaseInfoDTO; +use PhpOffice\PhpSpreadsheet\Calculation\Financial\Securities\Price; +use Symfony\Component\DomCrawler\Crawler; +use Symfony\Contracts\HttpClient\HttpClientInterface; + +class GenericWebProvider implements InfoProviderInterface +{ + + public const DISTRIBUTOR_NAME = 'Website'; + + public function __construct(private readonly HttpClientInterface $httpClient) + { + + } + + public function getProviderInfo(): array + { + return [ + 'name' => 'Generic Web URL', + 'description' => 'Tries to extract a part from a given product', + //'url' => 'https://example.com', + 'disabled_help' => 'Enable in settings to use this provider' + ]; + } + + public function getProviderKey(): string + { + return 'generic_web'; + } + + public function isActive(): bool + { + return true; + } + + public function searchByKeyword(string $keyword): array + { + return [ + $this->getDetails($keyword) + ]; + } + + private function extractShopName(string $url): string + { + $host = parse_url($url, PHP_URL_HOST); + if ($host === false || $host === null) { + return self::DISTRIBUTOR_NAME; + } + return $host; + } + + private function productJsonLdToPart(array $jsonLd, string $url): PartDetailDTO + { + $notes = $jsonLd['description'] ?? ""; + if (isset($jsonLd['disambiguatingDescription'])) { + if (!empty($notes)) { + $notes .= "\n\n"; + } + $notes .= $jsonLd['disambiguatingDescription']; + } + + $vendor_infos = null; + if (isset($jsonLd['offers'])) { + $vendor_infos = [new PurchaseInfoDTO( + distributor_name: $this->extractShopName($url), + order_number: $jsonLd['sku'] ?? $jsonLd['@id'] ?? $jsonLd['gtin'] ?? 'Unknown', + prices: [new PriceDTO(minimum_discount_amount: 1, price: (string) $jsonLd['offers']['price'], currency_iso_code: $jsonLd['offers']['priceCurrency'] ?? null)], + product_url: $jsonLd['url'] ?? $url, + )]; + } + + $image = null; + if (isset($jsonLd['image'])) { + if (is_array($jsonLd['image'])) { + $image = $jsonLd['image'][0] ?? null; + } elseif (is_string($jsonLd['image'])) { + $image = $jsonLd['image']; + } + } + + return new PartDetailDTO( + provider_key: $this->getProviderKey(), + provider_id: $url, + name: $jsonLd ['name'] ?? 'Unknown Name', + description: '', + category: isset($jsonLd['category']) && is_string($jsonLd['category']) ? $jsonLd['category'] : null, + manufacturer: $jsonLd['manufacturer']['name'] ?? $jsonLd['brand']['name'] ?? null, + mpn: $jsonLd['mpn'] ?? null, + preview_image_url: $image, + provider_url: $url, + notes: $notes, + vendor_infos: $vendor_infos, + mass: isset($jsonLd['weight']['value']) ? (float)$jsonLd['weight']['value'] : null, + ); + } + + /** + * Decodes JSON in a forgiving way, trying to fix common issues. + * @param string $json + * @return array + * @throws \JsonException + */ + private function json_decode_forgiving(string $json): array + { + //Sanitize common issues + $json = preg_replace("/[\r\n]+/", " ", $json); + return json_decode($json, true, 512, JSON_THROW_ON_ERROR); + } + + public function getDetails(string $id): PartDetailDTO + { + $url = $id; + + //Try to get the webpage content + $response = $this->httpClient->request('GET', $url); + $content = $response->getContent(); + + $dom = new Crawler($content); + + //Try to determine a canonical URL + $canonicalURL = $url; + if ($dom->filter('link[rel="canonical"]')->count() > 0) { + $canonicalURL = $dom->filter('link[rel="canonical"]')->attr('href'); + } else if ($dom->filter('meta[property="og:url"]')->count() > 0) { + $canonicalURL = $dom->filter('meta[property="og:url"]')->attr('content'); + } + + //Try to find json-ld data in the head + $jsonLdNodes = $dom->filter('head script[type="application/ld+json"]'); + foreach ($jsonLdNodes as $node) { + $jsonLd = $this->json_decode_forgiving($node->textContent); + if (isset($jsonLd['@type']) && $jsonLd['@type'] === 'Product') { //If we find a product use that data + return $this->productJsonLdToPart($jsonLd, $canonicalURL); + } + } + + + + return null; + } + + public function getCapabilities(): array + { + return [ + ProviderCapabilities::BASIC, + ProviderCapabilities::PICTURE, + ProviderCapabilities::PRICE + ]; + } +} From b89e878871eaa81ab0293995b9f1b32e7e4029b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 1 Feb 2026 16:39:19 +0100 Subject: [PATCH 02/15] Allow to rudimentary parse product pages, even if they do not contain JSON-LD data --- .../Providers/GenericWebProvider.php | 67 ++++++++++++++++--- 1 file changed, 59 insertions(+), 8 deletions(-) diff --git a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php index 6044e338..2704ad99 100644 --- a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php +++ b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php @@ -35,9 +35,18 @@ class GenericWebProvider implements InfoProviderInterface public const DISTRIBUTOR_NAME = 'Website'; - public function __construct(private readonly HttpClientInterface $httpClient) - { + private readonly HttpClientInterface $httpClient; + public function __construct(HttpClientInterface $httpClient) + { + $this->httpClient = $httpClient->withOptions( + [ + 'headers' => [ + 'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/144.0.0.0 Safari/537.36', + ], + 'timeout' => 15, + ] + ); } public function getProviderInfo(): array @@ -76,7 +85,7 @@ class GenericWebProvider implements InfoProviderInterface return $host; } - private function productJsonLdToPart(array $jsonLd, string $url): PartDetailDTO + private function productJsonLdToPart(array $jsonLd, string $url, Crawler $dom): PartDetailDTO { $notes = $jsonLd['description'] ?? ""; if (isset($jsonLd['disambiguatingDescription'])) { @@ -109,7 +118,7 @@ class GenericWebProvider implements InfoProviderInterface provider_key: $this->getProviderKey(), provider_id: $url, name: $jsonLd ['name'] ?? 'Unknown Name', - description: '', + description: $this->getMetaContent($dom, 'og:description') ?? $this->getMetaContent($dom, 'description') ?? '', category: isset($jsonLd['category']) && is_string($jsonLd['category']) ? $jsonLd['category'] : null, manufacturer: $jsonLd['manufacturer']['name'] ?? $jsonLd['brand']['name'] ?? null, mpn: $jsonLd['mpn'] ?? null, @@ -134,6 +143,22 @@ class GenericWebProvider implements InfoProviderInterface return json_decode($json, true, 512, JSON_THROW_ON_ERROR); } + private function getMetaContent(Crawler $dom, string $name): ?string + { + $meta = $dom->filter('meta[property="'.$name.'"]'); + if ($meta->count() > 0) { + return $meta->attr('content'); + } + + //Try name attribute + $meta = $dom->filter('meta[name="'.$name.'"]'); + if ($meta->count() > 0) { + return $meta->attr('content'); + } + + return null; + } + public function getDetails(string $id): PartDetailDTO { $url = $id; @@ -153,17 +178,43 @@ class GenericWebProvider implements InfoProviderInterface } //Try to find json-ld data in the head - $jsonLdNodes = $dom->filter('head script[type="application/ld+json"]'); + $jsonLdNodes = $dom->filter('script[type="application/ld+json"]'); foreach ($jsonLdNodes as $node) { $jsonLd = $this->json_decode_forgiving($node->textContent); if (isset($jsonLd['@type']) && $jsonLd['@type'] === 'Product') { //If we find a product use that data - return $this->productJsonLdToPart($jsonLd, $canonicalURL); + return $this->productJsonLdToPart($jsonLd, $canonicalURL, $dom); } } - + //If no JSON-LD data is found, try to extract basic data from meta tags + $pageTitle = $dom->filter('title')->count() > 0 ? $dom->filter('title')->text() : 'Unknown'; - return null; + $prices = []; + if ($price = $this->getMetaContent($dom, 'product:price:amount')) { + $prices[] = new PriceDTO( + minimum_discount_amount: 1, + price: $price, + currency_iso_code: $this->getMetaContent($dom, 'product:price:currency'), + ); + } + + $vendor_infos = [new PurchaseInfoDTO( + distributor_name: $this->extractShopName($canonicalURL), + order_number: 'Unknown', + prices: $prices, + product_url: $canonicalURL, + )]; + + return new PartDetailDTO( + provider_key: $this->getProviderKey(), + provider_id: $canonicalURL, + name: $this->getMetaContent($dom, 'og:title') ?? $pageTitle, + description: $this->getMetaContent($dom, 'og:description') ?? $this->getMetaContent($dom, 'description') ?? '', + manufacturer: $this->getMetaContent($dom, 'product:brand'), + preview_image_url: $this->getMetaContent($dom, 'og:image'), + provider_url: $canonicalURL, + vendor_infos: $vendor_infos, + ); } public function getCapabilities(): array From 73dbe64a83ad8ae1562bb3d1b137bfabc302ecde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 1 Feb 2026 16:51:26 +0100 Subject: [PATCH 03/15] Allow to extract prices form an Amazon page --- .../Providers/GenericWebProvider.php | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php index 2704ad99..09d135b8 100644 --- a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php +++ b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php @@ -196,6 +196,16 @@ class GenericWebProvider implements InfoProviderInterface price: $price, currency_iso_code: $this->getMetaContent($dom, 'product:price:currency'), ); + } else { + //Amazon fallback + $amazonAmount = $dom->filter('input[type="hidden"][name*="amount"]'); + if ($amazonAmount->count() > 0) { + $prices[] = new PriceDTO( + minimum_discount_amount: 1, + price: $amazonAmount->first()->attr('value'), + currency_iso_code: $dom->filter('input[type="hidden"][name*="currencyCode"]')->first()->attr('value'), + ); + } } $vendor_infos = [new PurchaseInfoDTO( From 52be5481702516f124dce5e75923ad172737e501 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 1 Feb 2026 16:55:52 +0100 Subject: [PATCH 04/15] Add https:// if not existing --- .../InfoProviderSystem/Providers/GenericWebProvider.php | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php index 09d135b8..17457c75 100644 --- a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php +++ b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php @@ -161,6 +161,14 @@ class GenericWebProvider implements InfoProviderInterface public function getDetails(string $id): PartDetailDTO { + //Add scheme if missing + if (!preg_match('/^https?:\/\//', $id)) { + //Remove any leading slashes + $id = ltrim($id, '/'); + + $id = 'https://'.$id; + } + $url = $id; //Try to get the webpage content From d868225260316aa303749f1cd72173fe58f2c32c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 1 Feb 2026 17:06:38 +0100 Subject: [PATCH 05/15] Properly parse JSONLD product data if it is in an array with others --- .../Providers/GenericWebProvider.php | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php index 17457c75..8b976b50 100644 --- a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php +++ b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php @@ -97,10 +97,17 @@ class GenericWebProvider implements InfoProviderInterface $vendor_infos = null; if (isset($jsonLd['offers'])) { + + if (array_is_list($jsonLd['offers'])) { + $offer = $jsonLd['offers'][0]; + } else { + $offer = $jsonLd['offers']; + } + $vendor_infos = [new PurchaseInfoDTO( distributor_name: $this->extractShopName($url), - order_number: $jsonLd['sku'] ?? $jsonLd['@id'] ?? $jsonLd['gtin'] ?? 'Unknown', - prices: [new PriceDTO(minimum_discount_amount: 1, price: (string) $jsonLd['offers']['price'], currency_iso_code: $jsonLd['offers']['priceCurrency'] ?? null)], + order_number: (string) ($jsonLd['sku'] ?? $jsonLd['@id'] ?? $jsonLd['gtin'] ?? 'Unknown'), + prices: [new PriceDTO(minimum_discount_amount: 1, price: (string) $offer['price'], currency_iso_code: $offer['priceCurrency'] ?? null)], product_url: $jsonLd['url'] ?? $url, )]; } @@ -189,8 +196,14 @@ class GenericWebProvider implements InfoProviderInterface $jsonLdNodes = $dom->filter('script[type="application/ld+json"]'); foreach ($jsonLdNodes as $node) { $jsonLd = $this->json_decode_forgiving($node->textContent); - if (isset($jsonLd['@type']) && $jsonLd['@type'] === 'Product') { //If we find a product use that data - return $this->productJsonLdToPart($jsonLd, $canonicalURL, $dom); + //If the content of json-ld is an array, try to find a product inside + if (!array_is_list($jsonLd)) { + $jsonLd = [$jsonLd]; + } + foreach ($jsonLd as $item) { + if (isset($item['@type']) && $item['@type'] === 'Product') { + return $this->productJsonLdToPart($item, $canonicalURL, $dom); + } } } From 1213f82cdf73850d783f2ad56611341293ad8756 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 1 Feb 2026 17:11:41 +0100 Subject: [PATCH 06/15] Fix if canonical URL is relative --- .../Providers/GenericWebProvider.php | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php index 8b976b50..3c657989 100644 --- a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php +++ b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php @@ -104,6 +104,14 @@ class GenericWebProvider implements InfoProviderInterface $offer = $jsonLd['offers']; } + //Make $jsonLd['url'] absolute if it's relative + if (isset($jsonLd['url']) && parse_url($jsonLd['url'], PHP_URL_SCHEME) === null) { + $parsedUrl = parse_url($url); + $scheme = $parsedUrl['scheme'] ?? 'https'; + $host = $parsedUrl['host'] ?? ''; + $jsonLd['url'] = $scheme.'://'.$host.$jsonLd['url']; + } + $vendor_infos = [new PurchaseInfoDTO( distributor_name: $this->extractShopName($url), order_number: (string) ($jsonLd['sku'] ?? $jsonLd['@id'] ?? $jsonLd['gtin'] ?? 'Unknown'), @@ -192,6 +200,14 @@ class GenericWebProvider implements InfoProviderInterface $canonicalURL = $dom->filter('meta[property="og:url"]')->attr('content'); } + //If the canonical URL is relative, make it absolute + if (parse_url($canonicalURL, PHP_URL_SCHEME) === null) { + $parsedUrl = parse_url($url); + $scheme = $parsedUrl['scheme'] ?? 'https'; + $host = $parsedUrl['host'] ?? ''; + $canonicalURL = $scheme.'://'.$host.$canonicalURL; + } + //Try to find json-ld data in the head $jsonLdNodes = $dom->filter('script[type="application/ld+json"]'); foreach ($jsonLdNodes as $node) { From 7feba634b8f8195e583b851c669b19a35aa011e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 1 Feb 2026 17:20:13 +0100 Subject: [PATCH 07/15] Hadle if offers are nested and images are ImageObjects in JSON+LD --- .../Providers/GenericWebProvider.php | 30 +++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php index 3c657989..a098c685 100644 --- a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php +++ b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php @@ -112,10 +112,30 @@ class GenericWebProvider implements InfoProviderInterface $jsonLd['url'] = $scheme.'://'.$host.$jsonLd['url']; } + $prices = []; + if (isset($offer['price'])) { + $prices[] = new PriceDTO( + minimum_discount_amount: 1, + price: (string) $offer['price'], + currency_iso_code: $offer['priceCurrency'] ?? null + ); + } else if (isset($offer['offers']) && array_is_list($offer['offers'])) { + //Some sites nest offers + foreach ($offer['offers'] as $subOffer) { + if (isset($subOffer['price'])) { + $prices[] = new PriceDTO( + minimum_discount_amount: 1, + price: (string) $subOffer['price'], + currency_iso_code: $subOffer['priceCurrency'] ?? null + ); + } + } + } + $vendor_infos = [new PurchaseInfoDTO( distributor_name: $this->extractShopName($url), order_number: (string) ($jsonLd['sku'] ?? $jsonLd['@id'] ?? $jsonLd['gtin'] ?? 'Unknown'), - prices: [new PriceDTO(minimum_discount_amount: 1, price: (string) $offer['price'], currency_iso_code: $offer['priceCurrency'] ?? null)], + prices: $prices, product_url: $jsonLd['url'] ?? $url, )]; } @@ -123,11 +143,17 @@ class GenericWebProvider implements InfoProviderInterface $image = null; if (isset($jsonLd['image'])) { if (is_array($jsonLd['image'])) { - $image = $jsonLd['image'][0] ?? null; + if (array_is_list($jsonLd['image'])) { + $image = $jsonLd['image'][0] ?? null; + } } elseif (is_string($jsonLd['image'])) { $image = $jsonLd['image']; } } + //If image is an object with @type ImageObject, extract the url + if (is_array($image) && isset($image['@type']) && $image['@type'] === 'ImageObject') { + $image = $image['contentUrl'] ?? $image['url'] ?? null; + } return new PartDetailDTO( provider_key: $this->getProviderKey(), From 071f6f85913ab4a5c496a06e46939ab603c0a77f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 1 Feb 2026 17:34:08 +0100 Subject: [PATCH 08/15] Return an empty array if no URL is provider to the Generic Web URL provider --- .../ProviderIDNotSupportedException.php | 32 +++++++++++++++++++ .../Providers/GenericWebProvider.php | 13 +++++++- 2 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 src/Exceptions/ProviderIDNotSupportedException.php diff --git a/src/Exceptions/ProviderIDNotSupportedException.php b/src/Exceptions/ProviderIDNotSupportedException.php new file mode 100644 index 00000000..429f43ea --- /dev/null +++ b/src/Exceptions/ProviderIDNotSupportedException.php @@ -0,0 +1,32 @@ +. + */ + +declare(strict_types=1); + + +namespace App\Exceptions; + +class ProviderIDNotSupportedException extends \RuntimeException +{ + public function fromProvider(string $providerKey, string $id): self + { + return new self(sprintf('The given ID %s is not supported by the provider %s.', $id, $providerKey,)); + } +} diff --git a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php index a098c685..248b4f0e 100644 --- a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php +++ b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php @@ -23,6 +23,7 @@ declare(strict_types=1); namespace App\Services\InfoProviderSystem\Providers; +use App\Exceptions\ProviderIDNotSupportedException; use App\Services\InfoProviderSystem\DTOs\PartDetailDTO; use App\Services\InfoProviderSystem\DTOs\PriceDTO; use App\Services\InfoProviderSystem\DTOs\PurchaseInfoDTO; @@ -71,9 +72,12 @@ class GenericWebProvider implements InfoProviderInterface public function searchByKeyword(string $keyword): array { + try { return [ $this->getDetails($keyword) - ]; + ]; } catch (ProviderIDNotSupportedException $e) { + return []; + } } private function extractShopName(string $url): string @@ -212,6 +216,13 @@ class GenericWebProvider implements InfoProviderInterface $url = $id; + //If this is not a valid URL with host, domain and path, throw an exception + if (filter_var($url, FILTER_VALIDATE_URL) === false || + parse_url($url, PHP_URL_HOST) === null || + parse_url($url, PHP_URL_PATH) === null) { + throw new ProviderIDNotSupportedException("The given ID is not a valid URL: ".$id); + } + //Try to get the webpage content $response = $this->httpClient->request('GET', $url); $content = $response->getContent(); From 722eb7ddab0e8a00d36845b68eaff82cde2f1f20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 1 Feb 2026 17:47:04 +0100 Subject: [PATCH 09/15] Added settings and docs for the generic Web info provider --- docs/usage/information_provider_system.md | 15 +++++++ .../Providers/GenericWebProvider.php | 10 +++-- .../GenericWebProviderSettings.php | 43 +++++++++++++++++++ .../InfoProviderSettings.php | 3 ++ .../settings/provider_settings.html.twig | 2 +- translations/messages.en.xlf | 18 ++++++++ 6 files changed, 86 insertions(+), 5 deletions(-) create mode 100644 src/Settings/InfoProviderSystem/GenericWebProviderSettings.php diff --git a/docs/usage/information_provider_system.md b/docs/usage/information_provider_system.md index da8ea32b..6cdb5183 100644 --- a/docs/usage/information_provider_system.md +++ b/docs/usage/information_provider_system.md @@ -96,6 +96,21 @@ The following providers are currently available and shipped with Part-DB: (All trademarks are property of their respective owners. Part-DB is not affiliated with any of the companies.) +### Generic Web URL Provider +The Generic Web URL Provider can extract part information from any webpage that contains structured data in the form of +[Schema.org](https://schema.org/) format. Many e-commerce websites use this format to provide detailed product information +for search engines and other services. Therefore it allows Part-DB to retrieve rudimentary part information (like name, image and price) +from a wide range of websites without the need for a dedicated API integration. +To use the Generic Web URL Provider, simply enable it in the information provider settings. No additional configuration +is required. Afterwards you can enter any product URL in the search field, and Part-DB will attempt to extract the relevant part information +from the webpage. + +Please note that if this provider is enabled, Part-DB will make HTTP requests to external websites to fetch product data, which +may have privacy and security implications. + +Following env configuration options are available: +* `PROVIDER_GENERIC_WEB_ENABLED`: Set this to `1` to enable the Generic Web URL Provider (optional, default: `0`) + ### Octopart The Octopart provider uses the [Octopart / Nexar API](https://nexar.com/api) to search for parts and get information. diff --git a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php index 248b4f0e..d05aac8f 100644 --- a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php +++ b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php @@ -27,6 +27,7 @@ use App\Exceptions\ProviderIDNotSupportedException; use App\Services\InfoProviderSystem\DTOs\PartDetailDTO; use App\Services\InfoProviderSystem\DTOs\PriceDTO; use App\Services\InfoProviderSystem\DTOs\PurchaseInfoDTO; +use App\Settings\InfoProviderSystem\GenericWebProviderSettings; use PhpOffice\PhpSpreadsheet\Calculation\Financial\Securities\Price; use Symfony\Component\DomCrawler\Crawler; use Symfony\Contracts\HttpClient\HttpClientInterface; @@ -38,7 +39,7 @@ class GenericWebProvider implements InfoProviderInterface private readonly HttpClientInterface $httpClient; - public function __construct(HttpClientInterface $httpClient) + public function __construct(HttpClientInterface $httpClient, private readonly GenericWebProviderSettings $settings) { $this->httpClient = $httpClient->withOptions( [ @@ -54,9 +55,10 @@ class GenericWebProvider implements InfoProviderInterface { return [ 'name' => 'Generic Web URL', - 'description' => 'Tries to extract a part from a given product', + 'description' => 'Tries to extract a part from a given product webpage URL using common metadata standards like JSON-LD and OpenGraph.', //'url' => 'https://example.com', - 'disabled_help' => 'Enable in settings to use this provider' + 'disabled_help' => 'Enable in settings to use this provider', + 'settings_class' => GenericWebProviderSettings::class, ]; } @@ -67,7 +69,7 @@ class GenericWebProvider implements InfoProviderInterface public function isActive(): bool { - return true; + return $this->settings->enabled; } public function searchByKeyword(string $keyword): array diff --git a/src/Settings/InfoProviderSystem/GenericWebProviderSettings.php b/src/Settings/InfoProviderSystem/GenericWebProviderSettings.php new file mode 100644 index 00000000..064d8a1c --- /dev/null +++ b/src/Settings/InfoProviderSystem/GenericWebProviderSettings.php @@ -0,0 +1,43 @@ +. + */ + +declare(strict_types=1); + + +namespace App\Settings\InfoProviderSystem; + +use App\Settings\SettingsIcon; +use Jbtronics\SettingsBundle\Metadata\EnvVarMode; +use Jbtronics\SettingsBundle\Settings\Settings; +use Jbtronics\SettingsBundle\Settings\SettingsParameter; +use Jbtronics\SettingsBundle\Settings\SettingsTrait; +use Symfony\Component\Translation\TranslatableMessage as TM; + +#[Settings(label: new TM("settings.ips.generic_web_provider"), description: new TM("settings.ips.generic_web_provider.description"))] +#[SettingsIcon("fa-plug")] +class GenericWebProviderSettings +{ + use SettingsTrait; + + #[SettingsParameter(label: new TM("settings.ips.lcsc.enabled"), description: new TM("settings.ips.generic_web_provider.enabled.help"), + envVar: "bool:PROVIDER_GENERIC_WEB_ENABLED", envVarMode: EnvVarMode::OVERWRITE + )] + public bool $enabled = false; +} diff --git a/src/Settings/InfoProviderSystem/InfoProviderSettings.php b/src/Settings/InfoProviderSystem/InfoProviderSettings.php index fb31bdb9..3e78233f 100644 --- a/src/Settings/InfoProviderSystem/InfoProviderSettings.php +++ b/src/Settings/InfoProviderSystem/InfoProviderSettings.php @@ -37,6 +37,9 @@ class InfoProviderSettings #[EmbeddedSettings] public ?InfoProviderGeneralSettings $general = null; + #[EmbeddedSettings] + public ?GenericWebProviderSettings $genericWebProvider = null; + #[EmbeddedSettings] public ?DigikeySettings $digikey = null; diff --git a/templates/info_providers/settings/provider_settings.html.twig b/templates/info_providers/settings/provider_settings.html.twig index 1876c2eb..86e5bc9b 100644 --- a/templates/info_providers/settings/provider_settings.html.twig +++ b/templates/info_providers/settings/provider_settings.html.twig @@ -10,7 +10,7 @@ {% block card_content %}

- {% if info_provider_info.url %} + {% if info_provider_info.url is defined %} {{ info_provider_info.name }} {% else %} {{ info_provider_info.name }} diff --git a/translations/messages.en.xlf b/translations/messages.en.xlf index b2bd908e..706d629a 100644 --- a/translations/messages.en.xlf +++ b/translations/messages.en.xlf @@ -14316,5 +14316,23 @@ Buerklin-API Authentication server: Only includes attachments in the selected languages in the results. + + + settings.ips.generic_web_provider + Generic Web URL Provider + + + + + settings.ips.generic_web_provider.description + This info provider allows to retrieve basic part information from many shop page URLs. + + + + + settings.ips.generic_web_provider.enabled.help + When the provider is enabled, users can make requests to arbitary websites on behalf of the Part-DB server. Only enable this, if you are aware of the potential consequences. + + From 909cab004431be9c89a76c888bb23614b70f9507 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 1 Feb 2026 18:18:58 +0100 Subject: [PATCH 10/15] Added an web page to quickly add a new part from a web URL --- src/Controller/InfoProviderController.php | 56 +++++++++++++++++++ src/Services/Trees/ToolsTreeBuilder.php | 10 ++++ .../GenericWebProviderSettings.php | 2 +- templates/_navbar.html.twig | 16 ++++-- .../from_url/from_url.html.twig | 21 +++++++ translations/messages.en.xlf | 24 ++++++++ 6 files changed, 124 insertions(+), 5 deletions(-) create mode 100644 templates/info_providers/from_url/from_url.html.twig diff --git a/src/Controller/InfoProviderController.php b/src/Controller/InfoProviderController.php index e5a5d87b..deec8a57 100644 --- a/src/Controller/InfoProviderController.php +++ b/src/Controller/InfoProviderController.php @@ -30,6 +30,7 @@ use App\Form\InfoProviderSystem\PartSearchType; use App\Services\InfoProviderSystem\ExistingPartFinder; use App\Services\InfoProviderSystem\PartInfoRetriever; use App\Services\InfoProviderSystem\ProviderRegistry; +use App\Services\InfoProviderSystem\Providers\GenericWebProvider; use App\Settings\AppSettings; use App\Settings\InfoProviderSystem\InfoProviderGeneralSettings; use Doctrine\ORM\EntityManagerInterface; @@ -39,6 +40,7 @@ use Psr\Log\LoggerInterface; use Symfony\Bridge\Doctrine\Attribute\MapEntity; use Symfony\Bundle\FrameworkBundle\Controller\AbstractController; use Symfony\Component\Form\Extension\Core\Type\SubmitType; +use Symfony\Component\Form\Extension\Core\Type\UrlType; use Symfony\Component\HttpClient\Exception\ClientException; use Symfony\Component\HttpClient\Exception\TransportException; use Symfony\Component\HttpFoundation\Request; @@ -208,4 +210,58 @@ class InfoProviderController extends AbstractController 'update_target' => $update_target ]); } + + #[Route('/from_url', name: 'info_providers_from_url')] + public function fromURL(Request $request, GenericWebProvider $provider): Response + { + $this->denyAccessUnlessGranted('@info_providers.create_parts'); + + if (!$provider->isActive()) { + $this->addFlash('error', "Generic Web Provider is not active. Please enable it in the provider settings."); + return $this->redirectToRoute('info_providers_list'); + } + + $formBuilder = $this->createFormBuilder(); + $formBuilder->add('url', UrlType::class, [ + 'label' => 'info_providers.from_url.url.label', + 'required' => true, + ]); + $formBuilder->add('submit', SubmitType::class, [ + 'label' => 'info_providers.search.submit', + ]); + + $form = $formBuilder->getForm(); + $form->handleRequest($request); + + $partDetail = null; + if ($form->isSubmitted() && $form->isValid()) { + //Try to retrieve the part detail from the given URL + $url = $form->get('url')->getData(); + try { + $searchResult = $this->infoRetriever->searchByKeyword( + keyword: $url, + providers: [$provider] + ); + + if (count($searchResult) === 0) { + $this->addFlash('warning', t('info_providers.from_url.no_part_found')); + } else { + $searchResult = $searchResult[0]; + //Redirect to the part creation page with the found part detail + return $this->redirectToRoute('info_providers_create_part', [ + 'providerKey' => $searchResult->provider_key, + 'providerId' => $searchResult->provider_id, + ]); + } + } catch (ExceptionInterface $e) { + $this->addFlash('error', t('info_providers.search.error.general_exception', ['%type%' => (new \ReflectionClass($e))->getShortName()])); + } + } + + return $this->render('info_providers/from_url/from_url.html.twig', [ + 'form' => $form, + 'partDetail' => $partDetail, + ]); + + } } diff --git a/src/Services/Trees/ToolsTreeBuilder.php b/src/Services/Trees/ToolsTreeBuilder.php index 37a09b09..c8afac12 100644 --- a/src/Services/Trees/ToolsTreeBuilder.php +++ b/src/Services/Trees/ToolsTreeBuilder.php @@ -39,6 +39,8 @@ use App\Entity\UserSystem\User; use App\Helpers\Trees\TreeViewNode; use App\Services\Cache\UserCacheKeyGenerator; use App\Services\ElementTypeNameGenerator; +use App\Services\InfoProviderSystem\Providers\GenericWebProvider; +use App\Settings\InfoProviderSystem\GenericWebProviderSettings; use Symfony\Bundle\SecurityBundle\Security; use Symfony\Component\Routing\Generator\UrlGeneratorInterface; use Symfony\Contracts\Cache\ItemInterface; @@ -58,6 +60,7 @@ class ToolsTreeBuilder protected UserCacheKeyGenerator $keyGenerator, protected Security $security, private readonly ElementTypeNameGenerator $elementTypeNameGenerator, + private readonly GenericWebProviderSettings $genericWebProviderSettings ) { } @@ -147,6 +150,13 @@ class ToolsTreeBuilder $this->urlGenerator->generate('info_providers_search') ))->setIcon('fa-treeview fa-fw fa-solid fa-cloud-arrow-down'); + if ($this->genericWebProviderSettings->enabled) { + $nodes[] = (new TreeViewNode( + $this->translator->trans('info_providers.from_url.title'), + $this->urlGenerator->generate('info_providers_from_url') + ))->setIcon('fa-treeview fa-fw fa-solid fa-book-atlas'); + } + $nodes[] = (new TreeViewNode( $this->translator->trans('info_providers.bulk_import.manage_jobs'), $this->urlGenerator->generate('bulk_info_provider_manage') diff --git a/src/Settings/InfoProviderSystem/GenericWebProviderSettings.php b/src/Settings/InfoProviderSystem/GenericWebProviderSettings.php index 064d8a1c..07972141 100644 --- a/src/Settings/InfoProviderSystem/GenericWebProviderSettings.php +++ b/src/Settings/InfoProviderSystem/GenericWebProviderSettings.php @@ -30,7 +30,7 @@ use Jbtronics\SettingsBundle\Settings\SettingsParameter; use Jbtronics\SettingsBundle\Settings\SettingsTrait; use Symfony\Component\Translation\TranslatableMessage as TM; -#[Settings(label: new TM("settings.ips.generic_web_provider"), description: new TM("settings.ips.generic_web_provider.description"))] +#[Settings(name: "generic_web_provider", label: new TM("settings.ips.generic_web_provider"), description: new TM("settings.ips.generic_web_provider.description"))] #[SettingsIcon("fa-plug")] class GenericWebProviderSettings { diff --git a/templates/_navbar.html.twig b/templates/_navbar.html.twig index 446ccdab..c4dfbe0f 100644 --- a/templates/_navbar.html.twig +++ b/templates/_navbar.html.twig @@ -10,9 +10,9 @@ - {% if is_granted("@tools.label_scanner") %} + {% if is_granted("@tools.label_scanner") %} - + {% endif %}

@@ -52,6 +52,14 @@ {% trans %}info_providers.search.title{% endtrans %} + {% if settings_instance('generic_web_provider').enabled %} +
  • + + + {% trans %}info_providers.from_url.title{% endtrans %} + +
  • + {% endif %} {% endif %} {% if is_granted('@parts.import') %} @@ -69,7 +77,7 @@ {% if is_granted('@parts.read') %} {{ search.search_form("navbar") }} - {# {% include "_navbar_search.html.twig" %} #} + {# {% include "_navbar_search.html.twig" %} #} {% endif %} @@ -145,4 +153,4 @@ - \ No newline at end of file + diff --git a/templates/info_providers/from_url/from_url.html.twig b/templates/info_providers/from_url/from_url.html.twig new file mode 100644 index 00000000..5aad1a03 --- /dev/null +++ b/templates/info_providers/from_url/from_url.html.twig @@ -0,0 +1,21 @@ +{% extends "main_card.html.twig" %} + +{% import "info_providers/providers.macro.html.twig" as providers_macro %} +{% import "helper.twig" as helper %} + +{% block title %} + {% trans %}info_providers.from_url.title{% endtrans %} +{% endblock %} + +{% block card_title %} + {% trans %}info_providers.from_url.title{% endtrans %} +{% endblock %} + +{% block card_content %} +

    {% trans %}info_providers.from_url.help{% endtrans %}

    + + {{ form_start(form) }} + {{ form_row(form.url) }} + {{ form_row(form.submit) }} + {{ form_end(form) }} +{% endblock %} diff --git a/translations/messages.en.xlf b/translations/messages.en.xlf index 706d629a..87f6c2f6 100644 --- a/translations/messages.en.xlf +++ b/translations/messages.en.xlf @@ -14334,5 +14334,29 @@ Buerklin-API Authentication server: When the provider is enabled, users can make requests to arbitary websites on behalf of the Part-DB server. Only enable this, if you are aware of the potential consequences. + + + info_providers.from_url.title + Create [part] from URL + + + + + info_providers.from_url.url.label + URL + + + + + info_providers.from_url.no_part_found + No part found from the given URL. Are you sure this is a valid shop URL? + + + + + info_providers.from_url.help + Creates a part based on the given URL. It tries to delegate it to an existing info provider if possible, otherwise it will be tried to extract rudimentary data from the webpage's metadata. + + From 47c7ee9f07131ecd2b6be11eca8f6e42abc974f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 1 Feb 2026 18:24:46 +0100 Subject: [PATCH 11/15] Allow to extract parameters form additionalProperty JSONLD data --- .../Providers/GenericWebProvider.php | 32 ++++++++++++++++--- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php index d05aac8f..4b73ad6e 100644 --- a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php +++ b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php @@ -24,6 +24,7 @@ declare(strict_types=1); namespace App\Services\InfoProviderSystem\Providers; use App\Exceptions\ProviderIDNotSupportedException; +use App\Services\InfoProviderSystem\DTOs\ParameterDTO; use App\Services\InfoProviderSystem\DTOs\PartDetailDTO; use App\Services\InfoProviderSystem\DTOs\PriceDTO; use App\Services\InfoProviderSystem\DTOs\PurchaseInfoDTO; @@ -75,9 +76,9 @@ class GenericWebProvider implements InfoProviderInterface public function searchByKeyword(string $keyword): array { try { - return [ - $this->getDetails($keyword) - ]; } catch (ProviderIDNotSupportedException $e) { + return [ + $this->getDetails($keyword) + ]; } catch (ProviderIDNotSupportedException $e) { return []; } } @@ -143,7 +144,7 @@ class GenericWebProvider implements InfoProviderInterface order_number: (string) ($jsonLd['sku'] ?? $jsonLd['@id'] ?? $jsonLd['gtin'] ?? 'Unknown'), prices: $prices, product_url: $jsonLd['url'] ?? $url, - )]; + )]; } $image = null; @@ -161,6 +162,26 @@ class GenericWebProvider implements InfoProviderInterface $image = $image['contentUrl'] ?? $image['url'] ?? null; } + //Try to extract parameters from additionalProperty + $parameters = []; + if (isset($jsonLd['additionalProperty']) && array_is_list($jsonLd['additionalProperty'])) { + foreach ($jsonLd['additionalProperty'] as $property) { //TODO: Handle minValue and maxValue + if (isset ($property['unitText'])) { + $parameters[] = ParameterDTO::parseValueField( + name: $property['name'] ?? 'Unknown', + value: $property['value'] ?? '', + unit: $property['unitText'] + ); + } else { + $parameters[] = ParameterDTO::parseValueIncludingUnit( + name: $property['name'] ?? 'Unknown', + value: $property['value'] ?? '' + ); + } + } + } + + return new PartDetailDTO( provider_key: $this->getProviderKey(), provider_id: $url, @@ -169,9 +190,10 @@ class GenericWebProvider implements InfoProviderInterface category: isset($jsonLd['category']) && is_string($jsonLd['category']) ? $jsonLd['category'] : null, manufacturer: $jsonLd['manufacturer']['name'] ?? $jsonLd['brand']['name'] ?? null, mpn: $jsonLd['mpn'] ?? null, - preview_image_url: $image, + preview_image_url: $image, provider_url: $url, notes: $notes, + parameters: $parameters, vendor_infos: $vendor_infos, mass: isset($jsonLd['weight']['value']) ? (float)$jsonLd['weight']['value'] : null, ); From 10acc2e1300cad57766ba82c7e0ad9042817b232 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 1 Feb 2026 20:49:50 +0100 Subject: [PATCH 12/15] Added logic to delegate the info retrieval logic to another provider when giving an URL --- .../InfoProviderSystem/ProviderRegistry.php | 38 +++++++++++++++- .../Providers/GenericWebProvider.php | 44 ++++++++++++++++++- .../Providers/PollinProvider.php | 37 +++++++++++++--- .../URLHandlerInfoProviderInterface.php | 43 ++++++++++++++++++ .../ProviderRegistryTest.php | 18 +++++++- 5 files changed, 169 insertions(+), 11 deletions(-) create mode 100644 src/Services/InfoProviderSystem/Providers/URLHandlerInfoProviderInterface.php diff --git a/src/Services/InfoProviderSystem/ProviderRegistry.php b/src/Services/InfoProviderSystem/ProviderRegistry.php index f6c398d2..18b8a37a 100644 --- a/src/Services/InfoProviderSystem/ProviderRegistry.php +++ b/src/Services/InfoProviderSystem/ProviderRegistry.php @@ -24,6 +24,7 @@ declare(strict_types=1); namespace App\Services\InfoProviderSystem; use App\Services\InfoProviderSystem\Providers\InfoProviderInterface; +use App\Services\InfoProviderSystem\Providers\URLHandlerInfoProviderInterface; /** * This class keeps track of all registered info providers and allows to find them by their key @@ -47,6 +48,8 @@ final class ProviderRegistry */ private array $providers_disabled = []; + private array $providers_by_domain = []; + /** * @var bool Whether the registry has been initialized */ @@ -78,6 +81,14 @@ final class ProviderRegistry $this->providers_by_name[$key] = $provider; if ($provider->isActive()) { $this->providers_active[$key] = $provider; + if ($provider instanceof URLHandlerInfoProviderInterface) { + foreach ($provider->getHandledDomains() as $domain) { + if (isset($this->providers_by_domain[$domain])) { + throw new \LogicException("Domain $domain is already handled by another provider"); + } + $this->providers_by_domain[$domain] = $provider; + } + } } else { $this->providers_disabled[$key] = $provider; } @@ -139,4 +150,29 @@ final class ProviderRegistry return $this->providers_disabled; } -} \ No newline at end of file + + public function getProviderHandlingDomain(string $domain): (InfoProviderInterface&URLHandlerInfoProviderInterface)|null + { + if (!$this->initialized) { + $this->initStructures(); + } + + //Check if the domain is directly existing: + if (isset($this->providers_by_domain[$domain])) { + return $this->providers_by_domain[$domain]; + } + + //Otherwise check for subdomains: + $parts = explode('.', $domain); + while (count($parts) > 2) { + array_shift($parts); + $check_domain = implode('.', $parts); + if (isset($this->providers_by_domain[$check_domain])) { + return $this->providers_by_domain[$check_domain]; + } + } + + //If we found nothing, return null + return null; + } +} diff --git a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php index 4b73ad6e..bca3d7cb 100644 --- a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php +++ b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php @@ -28,8 +28,9 @@ use App\Services\InfoProviderSystem\DTOs\ParameterDTO; use App\Services\InfoProviderSystem\DTOs\PartDetailDTO; use App\Services\InfoProviderSystem\DTOs\PriceDTO; use App\Services\InfoProviderSystem\DTOs\PurchaseInfoDTO; +use App\Services\InfoProviderSystem\PartInfoRetriever; +use App\Services\InfoProviderSystem\ProviderRegistry; use App\Settings\InfoProviderSystem\GenericWebProviderSettings; -use PhpOffice\PhpSpreadsheet\Calculation\Financial\Securities\Price; use Symfony\Component\DomCrawler\Crawler; use Symfony\Contracts\HttpClient\HttpClientInterface; @@ -40,7 +41,9 @@ class GenericWebProvider implements InfoProviderInterface private readonly HttpClientInterface $httpClient; - public function __construct(HttpClientInterface $httpClient, private readonly GenericWebProviderSettings $settings) + public function __construct(HttpClientInterface $httpClient, private readonly GenericWebProviderSettings $settings, + private readonly ProviderRegistry $providerRegistry, private readonly PartInfoRetriever $infoRetriever, + ) { $this->httpClient = $httpClient->withOptions( [ @@ -228,6 +231,37 @@ class GenericWebProvider implements InfoProviderInterface return null; } + /** + * Delegates the URL to another provider if possible, otherwise return null + * @param string $url + * @return PartDetailDTO|null + */ + private function delegateToOtherProvider(string $url): ?PartDetailDTO + { + //Extract domain from url: + $host = parse_url($url, PHP_URL_HOST); + if ($host === false || $host === null) { + return null; + } + + $provider = $this->providerRegistry->getProviderHandlingDomain($host); + + if ($provider !== null && $provider->isActive() && $provider->getProviderKey() !== $this->getProviderKey()) { + try { + $id = $provider->getIDFromURL($url); + if ($id !== null) { + return $this->infoRetriever->getDetails($provider->getProviderKey(), $id); + } + return null; + } catch (ProviderIDNotSupportedException $e) { + //Ignore and continue + return null; + } + } + + return null; + } + public function getDetails(string $id): PartDetailDTO { //Add scheme if missing @@ -247,6 +281,12 @@ class GenericWebProvider implements InfoProviderInterface throw new ProviderIDNotSupportedException("The given ID is not a valid URL: ".$id); } + //Before loading the page, try to delegate to another provider + $delegatedPart = $this->delegateToOtherProvider($url); + if ($delegatedPart !== null) { + return $delegatedPart; + } + //Try to get the webpage content $response = $this->httpClient->request('GET', $url); $content = $response->getContent(); diff --git a/src/Services/InfoProviderSystem/Providers/PollinProvider.php b/src/Services/InfoProviderSystem/Providers/PollinProvider.php index 2c5d68a3..6ac969d3 100644 --- a/src/Services/InfoProviderSystem/Providers/PollinProvider.php +++ b/src/Services/InfoProviderSystem/Providers/PollinProvider.php @@ -36,7 +36,7 @@ use Symfony\Component\DependencyInjection\Attribute\Autowire; use Symfony\Component\DomCrawler\Crawler; use Symfony\Contracts\HttpClient\HttpClientInterface; -class PollinProvider implements InfoProviderInterface +class PollinProvider implements InfoProviderInterface, URLHandlerInfoProviderInterface { public function __construct(private readonly HttpClientInterface $client, @@ -141,11 +141,16 @@ class PollinProvider implements InfoProviderInterface $orderId = trim($dom->filter('span[itemprop="sku"]')->text()); //Text is important here //Calculate the mass - $massStr = $dom->filter('meta[itemprop="weight"]')->attr('content'); - //Remove the unit - $massStr = str_replace('kg', '', $massStr); - //Convert to float and convert to grams - $mass = (float) $massStr * 1000; + $massDom = $dom->filter('meta[itemprop="weight"]'); + if ($massDom->count() > 0) { + $massStr = $massDom->attr('content'); + $massStr = str_replace('kg', '', $massStr); + //Convert to float and convert to grams + $mass = (float) $massStr * 1000; + } else { + $mass = null; + } + //Parse purchase info $purchaseInfo = new PurchaseInfoDTO('Pollin', $orderId, $this->parsePrices($dom), $productPageUrl); @@ -248,4 +253,22 @@ class PollinProvider implements InfoProviderInterface ProviderCapabilities::DATASHEET ]; } -} \ No newline at end of file + + public function getHandledDomains(): array + { + return ['pollin.de']; + } + + public function getIDFromURL(string $url): ?string + { + //URL like: https://www.pollin.de/p/shelly-bluetooth-schalter-und-dimmer-blu-zb-button-plug-play-mocha-592325 + + //Extract the 6-digit number at the end of the URL + $matches = []; + if (preg_match('/-(\d{6})(?:\/|$)/', $url, $matches)) { + return $matches[1]; + } + + return null; + } +} diff --git a/src/Services/InfoProviderSystem/Providers/URLHandlerInfoProviderInterface.php b/src/Services/InfoProviderSystem/Providers/URLHandlerInfoProviderInterface.php new file mode 100644 index 00000000..c0506648 --- /dev/null +++ b/src/Services/InfoProviderSystem/Providers/URLHandlerInfoProviderInterface.php @@ -0,0 +1,43 @@ +. + */ + +declare(strict_types=1); + + +namespace App\Services\InfoProviderSystem\Providers; + +/** + * If an interface + */ +interface URLHandlerInfoProviderInterface +{ + /** + * Returns a list of supported domains (e.g. ["digikey.com"]) + * @return array An array of supported domains + */ + public function getHandledDomains(): array; + + /** + * Extracts the unique ID of a part from a given URL. It is okay if this is not a canonical ID, as long as it can be used to uniquely identify the part within this provider. + * @param string $url The URL to extract the ID from + * @return string|null The extracted ID, or null if the URL is not valid for this provider + */ + public function getIDFromURL(string $url): ?string; +} diff --git a/tests/Services/InfoProviderSystem/ProviderRegistryTest.php b/tests/Services/InfoProviderSystem/ProviderRegistryTest.php index 9026c5bf..48a1847f 100644 --- a/tests/Services/InfoProviderSystem/ProviderRegistryTest.php +++ b/tests/Services/InfoProviderSystem/ProviderRegistryTest.php @@ -24,6 +24,7 @@ namespace App\Tests\Services\InfoProviderSystem; use App\Services\InfoProviderSystem\ProviderRegistry; use App\Services\InfoProviderSystem\Providers\InfoProviderInterface; +use App\Services\InfoProviderSystem\Providers\URLHandlerInfoProviderInterface; use PHPUnit\Framework\TestCase; class ProviderRegistryTest extends TestCase @@ -44,9 +45,10 @@ class ProviderRegistryTest extends TestCase public function getMockProvider(string $key, bool $active = true): InfoProviderInterface { - $mock = $this->createMock(InfoProviderInterface::class); + $mock = $this->createMockForIntersectionOfInterfaces([InfoProviderInterface::class, URLHandlerInfoProviderInterface::class]); $mock->method('getProviderKey')->willReturn($key); $mock->method('isActive')->willReturn($active); + $mock->method('getHandledDomains')->willReturn(["$key.com", "test.$key.de"]); return $mock; } @@ -109,4 +111,18 @@ class ProviderRegistryTest extends TestCase $registry->getProviders(); } + + public function testGetProviderHandlingDomain(): void + { + $registry = new ProviderRegistry($this->providers); + + $this->assertEquals($this->providers[0], $registry->getProviderHandlingDomain('test1.com')); + $this->assertEquals($this->providers[0], $registry->getProviderHandlingDomain('www.test1.com')); //Subdomain should also work + + $this->assertEquals( + $this->providers[1], + $registry->getProviderHandlingDomain('test.test2.de') + ); + } + } From 24f0f0d23c325a23ab4d7d000b90e36e1fd80975 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 1 Feb 2026 21:18:06 +0100 Subject: [PATCH 13/15] Added URL handling to a few more existing info providers --- .../Providers/ConradProvider.php | 25 ++++++++- .../Providers/Element14Provider.php | 19 ++++++- .../Providers/GenericWebProvider.php | 51 +++++++++++++------ .../Providers/LCSCProvider.php | 19 ++++++- .../Providers/TMEProvider.php | 20 +++++++- 5 files changed, 115 insertions(+), 19 deletions(-) diff --git a/src/Services/InfoProviderSystem/Providers/ConradProvider.php b/src/Services/InfoProviderSystem/Providers/ConradProvider.php index 6212f148..32434dee 100644 --- a/src/Services/InfoProviderSystem/Providers/ConradProvider.php +++ b/src/Services/InfoProviderSystem/Providers/ConradProvider.php @@ -30,9 +30,10 @@ use App\Services\InfoProviderSystem\DTOs\PriceDTO; use App\Services\InfoProviderSystem\DTOs\PurchaseInfoDTO; use App\Services\InfoProviderSystem\DTOs\SearchResultDTO; use App\Settings\InfoProviderSystem\ConradSettings; +use App\Settings\InfoProviderSystem\ConradShopIDs; use Symfony\Contracts\HttpClient\HttpClientInterface; -readonly class ConradProvider implements InfoProviderInterface +readonly class ConradProvider implements InfoProviderInterface, URLHandlerInfoProviderInterface { private const SEARCH_ENDPOINT = '/search/1/v3/facetSearch'; @@ -317,4 +318,26 @@ readonly class ConradProvider implements InfoProviderInterface ProviderCapabilities::PRICE, ]; } + + public function getHandledDomains(): array + { + $domains = []; + foreach (ConradShopIDs::cases() as $shopID) { + $domains[] = $shopID->getDomain(); + } + return array_unique($domains); + } + + public function getIDFromURL(string $url): ?string + { + //Input: https://www.conrad.de/de/p/apple-iphone-air-wolkenweiss-256-gb-eek-a-a-g-16-5-cm-6-5-zoll-3475299.html + //The numbers before the optional .html are the product ID + + $matches = []; + if (preg_match('/-(\d+)(\.html)?$/', $url, $matches) === 1) { + return $matches[1]; + } + + return null; + } } diff --git a/src/Services/InfoProviderSystem/Providers/Element14Provider.php b/src/Services/InfoProviderSystem/Providers/Element14Provider.php index 27dfb908..9ae45728 100644 --- a/src/Services/InfoProviderSystem/Providers/Element14Provider.php +++ b/src/Services/InfoProviderSystem/Providers/Element14Provider.php @@ -33,7 +33,7 @@ use App\Settings\InfoProviderSystem\Element14Settings; use Composer\CaBundle\CaBundle; use Symfony\Contracts\HttpClient\HttpClientInterface; -class Element14Provider implements InfoProviderInterface +class Element14Provider implements InfoProviderInterface, URLHandlerInfoProviderInterface { private const ENDPOINT_URL = 'https://api.element14.com/catalog/products'; @@ -309,4 +309,21 @@ class Element14Provider implements InfoProviderInterface ProviderCapabilities::DATASHEET, ]; } + + public function getHandledDomains(): array + { + return ['element14.com', 'farnell.com', 'newark.com']; + } + + public function getIDFromURL(string $url): ?string + { + //Input URL example: https://de.farnell.com/on-semiconductor/bc547b/transistor-npn-to-92/dp/1017673 + //The digits after the /dp/ are the part ID + $matches = []; + if (preg_match('#/dp/(\d+)#', $url, $matches) === 1) { + return $matches[1]; + } + + return null; + } } diff --git a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php index bca3d7cb..d06a6105 100644 --- a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php +++ b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php @@ -28,6 +28,7 @@ use App\Services\InfoProviderSystem\DTOs\ParameterDTO; use App\Services\InfoProviderSystem\DTOs\PartDetailDTO; use App\Services\InfoProviderSystem\DTOs\PriceDTO; use App\Services\InfoProviderSystem\DTOs\PurchaseInfoDTO; +use App\Services\InfoProviderSystem\DTOs\SearchResultDTO; use App\Services\InfoProviderSystem\PartInfoRetriever; use App\Services\InfoProviderSystem\ProviderRegistry; use App\Settings\InfoProviderSystem\GenericWebProviderSettings; @@ -78,9 +79,17 @@ class GenericWebProvider implements InfoProviderInterface public function searchByKeyword(string $keyword): array { + $url = $this->fixAndValidateURL($keyword); + + //Before loading the page, try to delegate to another provider + $delegatedPart = $this->delegateToOtherProvider($url); + if ($delegatedPart !== null) { + return [$delegatedPart]; + } + try { return [ - $this->getDetails($keyword) + $this->getDetails($keyword, false) //We already tried delegation ]; } catch (ProviderIDNotSupportedException $e) { return []; } @@ -234,9 +243,9 @@ class GenericWebProvider implements InfoProviderInterface /** * Delegates the URL to another provider if possible, otherwise return null * @param string $url - * @return PartDetailDTO|null + * @return SearchResultDTO|null */ - private function delegateToOtherProvider(string $url): ?PartDetailDTO + private function delegateToOtherProvider(string $url): ?SearchResultDTO { //Extract domain from url: $host = parse_url($url, PHP_URL_HOST); @@ -250,7 +259,10 @@ class GenericWebProvider implements InfoProviderInterface try { $id = $provider->getIDFromURL($url); if ($id !== null) { - return $this->infoRetriever->getDetails($provider->getProviderKey(), $id); + $results = $this->infoRetriever->searchByKeyword($id, [$provider]); + if (count($results) > 0) { + return $results[0]; + } } return null; } catch (ProviderIDNotSupportedException $e) { @@ -262,29 +274,38 @@ class GenericWebProvider implements InfoProviderInterface return null; } - public function getDetails(string $id): PartDetailDTO + private function fixAndValidateURL(string $url): string { + $originalUrl = $url; + //Add scheme if missing - if (!preg_match('/^https?:\/\//', $id)) { + if (!preg_match('/^https?:\/\//', $url)) { //Remove any leading slashes - $id = ltrim($id, '/'); + $url = ltrim($url, '/'); - $id = 'https://'.$id; + $url = 'https://'.$url; } - $url = $id; - //If this is not a valid URL with host, domain and path, throw an exception if (filter_var($url, FILTER_VALIDATE_URL) === false || parse_url($url, PHP_URL_HOST) === null || parse_url($url, PHP_URL_PATH) === null) { - throw new ProviderIDNotSupportedException("The given ID is not a valid URL: ".$id); + throw new ProviderIDNotSupportedException("The given ID is not a valid URL: ".$originalUrl); } - //Before loading the page, try to delegate to another provider - $delegatedPart = $this->delegateToOtherProvider($url); - if ($delegatedPart !== null) { - return $delegatedPart; + return $url; + } + + public function getDetails(string $id, bool $check_for_delegation = true): PartDetailDTO + { + $url = $this->fixAndValidateURL($id); + + if ($check_for_delegation) { + //Before loading the page, try to delegate to another provider + $delegatedPart = $this->delegateToOtherProvider($url); + if ($delegatedPart !== null) { + return $delegatedPart; + } } //Try to get the webpage content diff --git a/src/Services/InfoProviderSystem/Providers/LCSCProvider.php b/src/Services/InfoProviderSystem/Providers/LCSCProvider.php index ede34eb8..1b807eff 100755 --- a/src/Services/InfoProviderSystem/Providers/LCSCProvider.php +++ b/src/Services/InfoProviderSystem/Providers/LCSCProvider.php @@ -33,7 +33,7 @@ use App\Settings\InfoProviderSystem\LCSCSettings; use Symfony\Component\HttpFoundation\Cookie; use Symfony\Contracts\HttpClient\HttpClientInterface; -class LCSCProvider implements BatchInfoProviderInterface +class LCSCProvider implements BatchInfoProviderInterface, URLHandlerInfoProviderInterface { private const ENDPOINT_URL = 'https://wmsc.lcsc.com/ftps/wm'; @@ -452,4 +452,21 @@ class LCSCProvider implements BatchInfoProviderInterface ProviderCapabilities::FOOTPRINT, ]; } + + public function getHandledDomains(): array + { + return ['lcsc.com']; + } + + public function getIDFromURL(string $url): ?string + { + //Input example: https://www.lcsc.com/product-detail/C258144.html?s_z=n_BC547 + //The part between the "C" and the ".html" is the unique ID + + $matches = []; + if (preg_match("#/product-detail/(\w+)\.html#", $url, $matches) > 0) { + return $matches[1]; + } + return null; + } } diff --git a/src/Services/InfoProviderSystem/Providers/TMEProvider.php b/src/Services/InfoProviderSystem/Providers/TMEProvider.php index 9bc73f09..938bc7b3 100644 --- a/src/Services/InfoProviderSystem/Providers/TMEProvider.php +++ b/src/Services/InfoProviderSystem/Providers/TMEProvider.php @@ -32,7 +32,7 @@ use App\Services\InfoProviderSystem\DTOs\PurchaseInfoDTO; use App\Services\InfoProviderSystem\DTOs\SearchResultDTO; use App\Settings\InfoProviderSystem\TMESettings; -class TMEProvider implements InfoProviderInterface +class TMEProvider implements InfoProviderInterface, URLHandlerInfoProviderInterface { private const VENDOR_NAME = 'TME'; @@ -296,4 +296,22 @@ class TMEProvider implements InfoProviderInterface ProviderCapabilities::PRICE, ]; } + + public function getHandledDomains(): array + { + return ['tme.eu']; + } + + public function getIDFromURL(string $url): ?string + { + //Input: https://www.tme.eu/de/details/fi321_se/kuhler/alutronic/ + //The ID is the part after the details segment and before the next slash + + $matches = []; + if (preg_match('#/details/([^/]+)/#', $url, $matches) === 1) { + return $matches[1]; + } + + return null; + } } From a1396c6696f4a9b6421a400b529a42176b3ea9cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 1 Feb 2026 21:19:11 +0100 Subject: [PATCH 14/15] Fixed delegation logic for PartDetailDTO --- .../InfoProviderSystem/Providers/GenericWebProvider.php | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php index d06a6105..66d45707 100644 --- a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php +++ b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php @@ -224,6 +224,12 @@ class GenericWebProvider implements InfoProviderInterface return json_decode($json, true, 512, JSON_THROW_ON_ERROR); } + /** + * Gets the content of a meta tag by its name or property attribute, or null if not found + * @param Crawler $dom + * @param string $name + * @return string|null + */ private function getMetaContent(Crawler $dom, string $name): ?string { $meta = $dom->filter('meta[property="'.$name.'"]'); @@ -304,7 +310,7 @@ class GenericWebProvider implements InfoProviderInterface //Before loading the page, try to delegate to another provider $delegatedPart = $this->delegateToOtherProvider($url); if ($delegatedPart !== null) { - return $delegatedPart; + return $this->infoRetriever->getDetailsForSearchResult($delegatedPart); } } From 0826acbd5282fad8361117660a1f762060aa690b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Sun, 1 Feb 2026 23:11:56 +0100 Subject: [PATCH 15/15] Fixed phpunit tests --- .../LabelSystem/BarcodeScanner/BarcodeRedirectorTest.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/Services/LabelSystem/BarcodeScanner/BarcodeRedirectorTest.php b/tests/Services/LabelSystem/BarcodeScanner/BarcodeRedirectorTest.php index c40e141d..c5bdb02d 100644 --- a/tests/Services/LabelSystem/BarcodeScanner/BarcodeRedirectorTest.php +++ b/tests/Services/LabelSystem/BarcodeScanner/BarcodeRedirectorTest.php @@ -64,7 +64,7 @@ final class BarcodeRedirectorTest extends KernelTestCase { yield [new LocalBarcodeScanResult(LabelSupportedElement::PART, 1, BarcodeSourceType::INTERNAL), '/en/part/1']; //Part lot redirects to Part info page (Part lot 1 is associated with part 3) - yield [new LocalBarcodeScanResult(LabelSupportedElement::PART_LOT, 1, BarcodeSourceType::INTERNAL), '/en/part/3']; + yield [new LocalBarcodeScanResult(LabelSupportedElement::PART_LOT, 1, BarcodeSourceType::INTERNAL), '/en/part/3?highlightLot=1']; yield [new LocalBarcodeScanResult(LabelSupportedElement::STORELOCATION, 1, BarcodeSourceType::INTERNAL), '/en/store_location/1/parts']; }