diff --git a/docs/usage/information_provider_system.md b/docs/usage/information_provider_system.md index da8ea32b..6cdb5183 100644 --- a/docs/usage/information_provider_system.md +++ b/docs/usage/information_provider_system.md @@ -96,6 +96,21 @@ The following providers are currently available and shipped with Part-DB: (All trademarks are property of their respective owners. Part-DB is not affiliated with any of the companies.) +### Generic Web URL Provider +The Generic Web URL Provider can extract part information from any webpage that contains structured data in the +[Schema.org](https://schema.org/) format. Many e-commerce websites use this format to provide detailed product information +for search engines and other services. Therefore, it allows Part-DB to retrieve rudimentary part information (like name, image and price) +from a wide range of websites without the need for a dedicated API integration. +To use the Generic Web URL Provider, simply enable it in the information provider settings. No additional configuration +is required. Afterwards, you can enter any product URL in the search field, and Part-DB will attempt to extract the relevant part information +from the webpage. + +Please note that if this provider is enabled, Part-DB will make HTTP requests to external websites to fetch product data, which +may have privacy and security implications. + +The following env configuration options are available: +* `PROVIDER_GENERIC_WEB_ENABLED`: Set this to `1` to enable the Generic Web URL Provider (optional, default: `0`) + ### Octopart The Octopart provider uses the [Octopart / Nexar API](https://nexar.com/api) to search for parts and get information. 
diff --git a/src/Controller/InfoProviderController.php b/src/Controller/InfoProviderController.php
index e5a5d87b..deec8a57 100644
--- a/src/Controller/InfoProviderController.php
+++ b/src/Controller/InfoProviderController.php
@@ -30,6 +30,7 @@ use App\Form\InfoProviderSystem\PartSearchType;
 use App\Services\InfoProviderSystem\ExistingPartFinder;
 use App\Services\InfoProviderSystem\PartInfoRetriever;
 use App\Services\InfoProviderSystem\ProviderRegistry;
+use App\Services\InfoProviderSystem\Providers\GenericWebProvider;
 use App\Settings\AppSettings;
 use App\Settings\InfoProviderSystem\InfoProviderGeneralSettings;
 use Doctrine\ORM\EntityManagerInterface;
@@ -39,6 +40,7 @@ use Psr\Log\LoggerInterface;
 use Symfony\Bridge\Doctrine\Attribute\MapEntity;
 use Symfony\Bundle\FrameworkBundle\Controller\AbstractController;
 use Symfony\Component\Form\Extension\Core\Type\SubmitType;
+use Symfony\Component\Form\Extension\Core\Type\UrlType;
 use Symfony\Component\HttpClient\Exception\ClientException;
 use Symfony\Component\HttpClient\Exception\TransportException;
 use Symfony\Component\HttpFoundation\Request;
@@ -208,4 +210,58 @@ class InfoProviderController extends AbstractController
             'update_target' => $update_target
         ]);
     }
+
+    /**
+     * Shows a form asking for a product page URL and, if the Generic Web provider finds a part there,
+     * redirects to the part creation page for the first search result.
+     *
+     * Requires the "@info_providers.create_parts" permission. If the provider is not active, an error
+     * flash is added and the user is redirected to the provider list. If nothing is found (or nothing
+     * was entered), a warning flash is added and the form is rendered again.
+     */
+    #[Route('/from_url', name: 'info_providers_from_url')]
+    public function fromURL(Request $request, GenericWebProvider $provider): Response
+    {
+        $this->denyAccessUnlessGranted('@info_providers.create_parts');
+
+        if (!$provider->isActive()) {
+            $this->addFlash('error', "Generic Web Provider is not active. Please enable it in the provider settings.");
+            return $this->redirectToRoute('info_providers_list');
+        }
+
+        $formBuilder = $this->createFormBuilder();
+        $formBuilder->add('url', UrlType::class, [
+            'label' => 'info_providers.from_url.url.label',
+            'required' => true,
+        ]);
+        $formBuilder->add('submit', SubmitType::class, [
+            'label' => 'info_providers.search.submit',
+        ]);
+
+        $form = $formBuilder->getForm();
+        $form->handleRequest($request);
+
+        //Never assigned a value here, but the template receives this variable
+        $partDetail = null;
+        if ($form->isSubmitted() && $form->isValid()) {
+            $url = $form->get('url')->getData();
+
+            //'required' only renders the HTML attribute and is independent of validation, so an empty
+            //submission still reaches this point (with a null/empty value). Do not pass that on as keyword.
+            if (!is_string($url) || trim($url) === '') {
+                $this->addFlash('warning', t('info_providers.from_url.no_part_found'));
+            } else {
+                //Try to retrieve the part detail from the given URL
+                try {
+                    $searchResult = $this->infoRetriever->searchByKeyword(
+                        keyword: $url,
+                        providers: [$provider]
+                    );
+
+                    if (count($searchResult) === 0) {
+                        $this->addFlash('warning', t('info_providers.from_url.no_part_found'));
+                    } else {
+                        $searchResult = $searchResult[0];
+                        //Redirect to the part creation page with the found part detail
+                        return $this->redirectToRoute('info_providers_create_part', [
+                            'providerKey' => $searchResult->provider_key,
+                            'providerId' => $searchResult->provider_id,
+                        ]);
+                    }
+                } catch (ExceptionInterface $e) {
+                    $this->addFlash('error', t('info_providers.search.error.general_exception', ['%type%' => (new \ReflectionClass($e))->getShortName()]));
+                }
+            }
+        }
+
+        return $this->render('info_providers/from_url/from_url.html.twig', [
+            'form' => $form,
+            'partDetail' => $partDetail,
+        ]);
+
+    }
 }
diff --git a/src/Exceptions/ProviderIDNotSupportedException.php b/src/Exceptions/ProviderIDNotSupportedException.php
new file mode 100644
index 00000000..429f43ea
--- /dev/null
+++ b/src/Exceptions/ProviderIDNotSupportedException.php
@@ -0,0 +1,32 @@
+.
+ */
+
+declare(strict_types=1);
+
+
+namespace App\Exceptions;
+
+/**
+ * Thrown when an ID (or keyword) is handed to an info provider that cannot interpret it.
+ */
+class ProviderIDNotSupportedException extends \RuntimeException
+{
+    /**
+     * Named constructor building the exception with a standard message.
+     *
+     * Declared static so it can be used as a factory without an existing instance; calling it on an
+     * instance keeps working.
+     *
+     * @param string $providerKey key of the provider that rejected the ID
+     * @param string $id the rejected ID
+     */
+    public static function fromProvider(string $providerKey, string $id): self
+    {
+        return new self(sprintf('The given ID %s is not supported by the provider %s.', $id, $providerKey));
+    }
+}
diff --git a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php
new file mode 100644
index 00000000..4b73ad6e
--- /dev/null
+++ b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php
@@ -0,0 +1,336 @@
+.
+ */
+
+declare(strict_types=1);
+
+
+namespace App\Services\InfoProviderSystem\Providers;
+
+use App\Exceptions\ProviderIDNotSupportedException;
+use App\Services\InfoProviderSystem\DTOs\ParameterDTO;
+use App\Services\InfoProviderSystem\DTOs\PartDetailDTO;
+use App\Services\InfoProviderSystem\DTOs\PriceDTO;
+use App\Services\InfoProviderSystem\DTOs\PurchaseInfoDTO;
+use App\Settings\InfoProviderSystem\GenericWebProviderSettings;
+use PhpOffice\PhpSpreadsheet\Calculation\Financial\Securities\Price;
+use Symfony\Component\DomCrawler\Crawler;
+use Symfony\Contracts\HttpClient\HttpClientInterface;
+
+/**
+ * Info provider that extracts part data from an arbitrary product web page.
+ *
+ * The page URL doubles as keyword and provider ID. The page is fetched over HTTP and scanned for a
+ * Schema.org "Product" entry in its JSON-LD scripts; if none is found, OpenGraph / meta tags are used.
+ *
+ * NOTE(review): the URL originates from user input and is requested server-side as given. Consider
+ * whether requests to internal/private hosts need to be restricted for your deployment.
+ */
+class GenericWebProvider implements InfoProviderInterface
+{
+
+    public const DISTRIBUTOR_NAME = 'Website';
+
+    private readonly HttpClientInterface $httpClient;
+
+    /**
+     * @param HttpClientInterface $httpClient base client; a copy with a browser user agent and a timeout of 15 is used
+     * @param GenericWebProviderSettings $settings settings deciding whether the provider is active
+     */
+    public function __construct(HttpClientInterface $httpClient, private readonly GenericWebProviderSettings $settings)
+    {
+        $this->httpClient = $httpClient->withOptions(
+            [
+                'headers' => [
+                    'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/144.0.0.0 Safari/537.36',
+                ],
+                'timeout' => 15,
+            ]
+        );
+    }
+
+    public function getProviderInfo(): array
+    {
+        return [
+            'name' => 'Generic Web URL',
+            'description' => 'Tries to extract a part from a given product webpage URL using common metadata standards like JSON-LD and OpenGraph.',
+            //'url' => 'https://example.com',
+            'disabled_help' => 'Enable in settings to use this provider',
+            'settings_class' => GenericWebProviderSettings::class,
+        ];
+    }
+
+    public function getProviderKey(): string
+    {
+        return 'generic_web';
+    }
+
+    public function isActive(): bool
+    {
+        return $this->settings->enabled;
+    }
+
+    /**
+     * Interprets the keyword as a product page URL.
+     *
+     * @return PartDetailDTO[] a single-element list, or an empty list if the keyword is not a usable URL.
+     *                         Errors of the HTTP request itself are not caught here.
+     */
+    public function searchByKeyword(string $keyword): array
+    {
+        try {
+            return [
+                $this->getDetails($keyword)
+            ];
+        } catch (ProviderIDNotSupportedException $e) {
+            return [];
+        }
+    }
+
+    /**
+     * Returns the host of the given URL, or DISTRIBUTOR_NAME if no host can be determined.
+     */
+    private function extractShopName(string $url): string
+    {
+        $host = parse_url($url, PHP_URL_HOST);
+        if ($host === false || $host === null) {
+            return self::DISTRIBUTOR_NAME;
+        }
+        return $host;
+    }
+
+    /**
+     * Makes a URL found inside a page absolute, using the page URL as base.
+     *
+     * Absolute URLs are returned unchanged. Protocol-relative URLs ("//host/path") inherit the base
+     * scheme. Everything else is appended to scheme, host and port of the base URL; a path without
+     * leading slash is treated as root-relative (simplification, no "../" resolution is done).
+     * An empty or unparsable candidate yields the base URL.
+     */
+    private function resolveUrl(string $candidate, string $baseUrl): string
+    {
+        $candidate = trim($candidate);
+        if ($candidate === '') {
+            return $baseUrl;
+        }
+
+        $scheme = parse_url($candidate, PHP_URL_SCHEME);
+        if ($scheme === false) {
+            //Seriously malformed URL
+            return $baseUrl;
+        }
+        if ($scheme !== null) {
+            return $candidate;
+        }
+
+        $base = parse_url($baseUrl);
+        if ($base === false) {
+            $base = [];
+        }
+        $baseScheme = $base['scheme'] ?? 'https';
+
+        if (str_starts_with($candidate, '//')) {
+            return $baseScheme.':'.$candidate;
+        }
+
+        $origin = $baseScheme.'://'.($base['host'] ?? '');
+        if (isset($base['port'])) {
+            $origin .= ':'.$base['port'];
+        }
+        if (!str_starts_with($candidate, '/')) {
+            $candidate = '/'.$candidate;
+        }
+
+        return $origin.$candidate;
+    }
+
+    /**
+     * Returns the name of a Schema.org entity that is given either as plain string or as object with a "name" key.
+     */
+    private function extractEntityName(mixed $entity): ?string
+    {
+        if (is_string($entity)) {
+            return $entity !== '' ? $entity : null;
+        }
+        if (is_array($entity) && isset($entity['name']) && is_string($entity['name'])) {
+            return $entity['name'];
+        }
+
+        return null;
+    }
+
+    /**
+     * Converts a decoded Schema.org "Product" JSON-LD object into a part detail DTO.
+     *
+     * @param array $jsonLd the decoded Product object
+     * @param string $url the (canonical) page URL; used as provider ID and as base for relative URLs
+     * @param Crawler $dom the page, used to read the description from its meta tags
+     */
+    private function productJsonLdToPart(array $jsonLd, string $url, Crawler $dom): PartDetailDTO
+    {
+        $notes = $jsonLd['description'] ?? "";
+        if (isset($jsonLd['disambiguatingDescription'])) {
+            if (!empty($notes)) {
+                $notes .= "\n\n";
+            }
+            $notes .= $jsonLd['disambiguatingDescription'];
+        }
+
+        $vendor_infos = null;
+        if (isset($jsonLd['offers']) && is_array($jsonLd['offers'])) {
+            //"offers" is either a single offer object or a list of them; only the first one is used.
+            //An empty list must not trigger an undefined index access.
+            $offer = array_is_list($jsonLd['offers']) ? ($jsonLd['offers'][0] ?? []) : $jsonLd['offers'];
+            if (!is_array($offer)) {
+                $offer = [];
+            }
+
+            //Make $jsonLd['url'] absolute if it's relative
+            $productUrl = $url;
+            if (isset($jsonLd['url']) && is_string($jsonLd['url'])) {
+                $productUrl = $this->resolveUrl($jsonLd['url'], $url);
+            }
+
+            $prices = [];
+            if (isset($offer['price'])) {
+                $prices[] = new PriceDTO(
+                    minimum_discount_amount: 1,
+                    price: (string) $offer['price'],
+                    currency_iso_code: $offer['priceCurrency'] ?? null
+                );
+            } elseif (isset($offer['offers']) && is_array($offer['offers']) && array_is_list($offer['offers'])) {
+                //Some sites nest offers
+                foreach ($offer['offers'] as $subOffer) {
+                    if (isset($subOffer['price'])) {
+                        $prices[] = new PriceDTO(
+                            minimum_discount_amount: 1,
+                            price: (string) $subOffer['price'],
+                            currency_iso_code: $subOffer['priceCurrency'] ?? null
+                        );
+                    }
+                }
+            }
+
+            $vendor_infos = [new PurchaseInfoDTO(
+                distributor_name: $this->extractShopName($url),
+                order_number: (string) ($jsonLd['sku'] ?? $jsonLd['@id'] ?? $jsonLd['gtin'] ?? 'Unknown'),
+                prices: $prices,
+                product_url: $productUrl,
+            )];
+        }
+
+        $image = $jsonLd['image'] ?? null;
+        //A list of images: use the first one
+        if (is_array($image) && array_is_list($image)) {
+            $image = $image[0] ?? null;
+        }
+        //An image object (given directly or as list element): extract its URL
+        if (is_array($image)) {
+            $image = $image['contentUrl'] ?? $image['url'] ?? null;
+        }
+        if (!is_string($image)) {
+            $image = null;
+        }
+
+        //Try to extract parameters from additionalProperty
+        $parameters = [];
+        if (isset($jsonLd['additionalProperty']) && is_array($jsonLd['additionalProperty']) && array_is_list($jsonLd['additionalProperty'])) {
+            foreach ($jsonLd['additionalProperty'] as $property) { //TODO: Handle minValue and maxValue
+                if (!is_array($property)) {
+                    continue;
+                }
+
+                if (isset($property['unitText'])) {
+                    $parameters[] = ParameterDTO::parseValueField(
+                        name: $property['name'] ?? 'Unknown',
+                        value: $property['value'] ?? '',
+                        unit: $property['unitText']
+                    );
+                } else {
+                    $parameters[] = ParameterDTO::parseValueIncludingUnit(
+                        name: $property['name'] ?? 'Unknown',
+                        value: $property['value'] ?? ''
+                    );
+                }
+            }
+        }
+
+
+        return new PartDetailDTO(
+            provider_key: $this->getProviderKey(),
+            provider_id: $url,
+            name: $jsonLd['name'] ?? 'Unknown Name',
+            description: $this->getMetaContent($dom, 'og:description') ?? $this->getMetaContent($dom, 'description') ?? '',
+            category: isset($jsonLd['category']) && is_string($jsonLd['category']) ? $jsonLd['category'] : null,
+            //manufacturer / brand may be a plain string or an object with a name
+            manufacturer: $this->extractEntityName($jsonLd['manufacturer'] ?? null) ?? $this->extractEntityName($jsonLd['brand'] ?? null),
+            mpn: $jsonLd['mpn'] ?? null,
+            preview_image_url: $image,
+            provider_url: $url,
+            notes: $notes,
+            parameters: $parameters,
+            vendor_infos: $vendor_infos,
+            mass: isset($jsonLd['weight']['value']) ? (float)$jsonLd['weight']['value'] : null,
+        );
+    }
+
+    /**
+     * Decodes JSON in a forgiving way, trying to fix common issues.
+     * @param string $json
+     * @return array
+     * @throws \JsonException if the JSON is invalid or does not decode to an object/array
+     */
+    private function json_decode_forgiving(string $json): array
+    {
+        //Sanitize common issues: raw line breaks are replaced by spaces (keep the input if the regex fails)
+        $json = preg_replace("/[\r\n]+/", " ", $json) ?? $json;
+
+        $decoded = json_decode($json, true, 512, JSON_THROW_ON_ERROR);
+        //Valid JSON may also be a scalar, which would violate the array return type
+        if (!is_array($decoded)) {
+            throw new \JsonException('The JSON document does not contain an object or array.');
+        }
+
+        return $decoded;
+    }
+
+    /**
+     * Returns the content of the first meta tag whose "property" (preferred) or "name" attribute equals $name.
+     *
+     * @return string|null null if no such meta tag exists (or it has no content attribute)
+     */
+    private function getMetaContent(Crawler $dom, string $name): ?string
+    {
+        $meta = $dom->filter('meta[property="'.$name.'"]');
+        if ($meta->count() > 0) {
+            return $meta->attr('content');
+        }
+
+        //Try name attribute
+        $meta = $dom->filter('meta[name="'.$name.'"]');
+        if ($meta->count() > 0) {
+            return $meta->attr('content');
+        }
+
+        return null;
+    }
+
+    /**
+     * Determines the canonical URL of the page from link[rel=canonical] or og:url, falling back to $url.
+     */
+    private function determineCanonicalUrl(Crawler $dom, string $url): string
+    {
+        $candidate = null;
+
+        $canonicalLink = $dom->filter('link[rel="canonical"]');
+        if ($canonicalLink->count() > 0) {
+            $candidate = $canonicalLink->attr('href');
+        } else {
+            $ogUrl = $dom->filter('meta[property="og:url"]');
+            if ($ogUrl->count() > 0) {
+                $candidate = $ogUrl->attr('content');
+            }
+        }
+
+        //The attribute may be missing (null) or relative
+        if ($candidate === null) {
+            return $url;
+        }
+
+        return $this->resolveUrl($candidate, $url);
+    }
+
+    /**
+     * Checks whether a JSON-LD "@type" value (a string or a list of strings) denotes a Product.
+     */
+    private function isProductType(mixed $type): bool
+    {
+        if (is_array($type)) {
+            return in_array('Product', $type, true);
+        }
+
+        return $type === 'Product';
+    }
+
+    /**
+     * Searches a decoded JSON-LD document for the first Product object.
+     *
+     * The document may be a single object or a list of objects; objects listed under "@graph" are checked as well.
+     *
+     * @return array|null the Product object, or null if there is none
+     */
+    private function findProductInJsonLd(array $jsonLd): ?array
+    {
+        $items = array_is_list($jsonLd) ? $jsonLd : [$jsonLd];
+
+        foreach ($items as $item) {
+            if (!is_array($item)) {
+                continue;
+            }
+            if ($this->isProductType($item['@type'] ?? null)) {
+                return $item;
+            }
+
+            if (isset($item['@graph']) && is_array($item['@graph'])) {
+                foreach ($item['@graph'] as $graphItem) {
+                    if (is_array($graphItem) && $this->isProductType($graphItem['@type'] ?? null)) {
+                        return $graphItem;
+                    }
+                }
+            }
+        }
+
+        return null;
+    }
+
+    /**
+     * Builds a part from OpenGraph / meta tags only; used when the page has no usable JSON-LD product.
+     */
+    private function metaTagsToPart(Crawler $dom, string $canonicalURL): PartDetailDTO
+    {
+        $pageTitle = $dom->filter('title')->count() > 0 ? $dom->filter('title')->text() : 'Unknown';
+
+        $prices = [];
+        $price = $this->getMetaContent($dom, 'product:price:amount');
+        if ($price !== null && $price !== '') {
+            $prices[] = new PriceDTO(
+                minimum_discount_amount: 1,
+                price: $price,
+                currency_iso_code: $this->getMetaContent($dom, 'product:price:currency'),
+            );
+        } else {
+            //Amazon fallback
+            $amazonAmount = $dom->filter('input[type="hidden"][name*="amount"]');
+            $amount = $amazonAmount->count() > 0 ? $amazonAmount->first()->attr('value') : null;
+            if ($amount !== null && $amount !== '') {
+                //The currency input is optional; reading an attribute from an empty node list would throw
+                $amazonCurrency = $dom->filter('input[type="hidden"][name*="currencyCode"]');
+                $prices[] = new PriceDTO(
+                    minimum_discount_amount: 1,
+                    price: $amount,
+                    currency_iso_code: $amazonCurrency->count() > 0 ? $amazonCurrency->first()->attr('value') : null,
+                );
+            }
+        }
+
+        $vendor_infos = [new PurchaseInfoDTO(
+            distributor_name: $this->extractShopName($canonicalURL),
+            order_number: 'Unknown',
+            prices: $prices,
+            product_url: $canonicalURL,
+        )];
+
+        return new PartDetailDTO(
+            provider_key: $this->getProviderKey(),
+            provider_id: $canonicalURL,
+            name: $this->getMetaContent($dom, 'og:title') ?? $pageTitle,
+            description: $this->getMetaContent($dom, 'og:description') ?? $this->getMetaContent($dom, 'description') ?? '',
+            manufacturer: $this->getMetaContent($dom, 'product:brand'),
+            preview_image_url: $this->getMetaContent($dom, 'og:image'),
+            provider_url: $canonicalURL,
+            vendor_infos: $vendor_infos,
+        );
+    }
+
+    /**
+     * Fetches the page behind the given URL and extracts the part information from it.
+     *
+     * @param string $id the product page URL; "https://" is prepended if no http(s) scheme is given
+     * @throws ProviderIDNotSupportedException if the ID is not a valid URL with host and path
+     */
+    public function getDetails(string $id): PartDetailDTO
+    {
+        $id = trim($id);
+
+        //Add scheme if missing (the scheme is matched case-insensitively)
+        if (!preg_match('/^https?:\/\//i', $id)) {
+            //Remove any leading slashes
+            $id = ltrim($id, '/');
+
+            $id = 'https://'.$id;
+        }
+
+        $url = $id;
+
+        //If this is not a valid URL with host, domain and path, throw an exception
+        if (filter_var($url, FILTER_VALIDATE_URL) === false ||
+            parse_url($url, PHP_URL_HOST) === null ||
+            parse_url($url, PHP_URL_PATH) === null) {
+            throw new ProviderIDNotSupportedException("The given ID is not a valid URL: ".$id);
+        }
+
+        //Try to get the webpage content
+        $response = $this->httpClient->request('GET', $url);
+        $content = $response->getContent();
+
+        $dom = new Crawler($content);
+
+        //Try to determine a canonical URL
+        $canonicalURL = $this->determineCanonicalUrl($dom, $url);
+
+        //Try to find a product in the json-ld data of the page
+        $jsonLdNodes = $dom->filter('script[type="application/ld+json"]');
+        foreach ($jsonLdNodes as $node) {
+            try {
+                $jsonLd = $this->json_decode_forgiving($node->textContent);
+            } catch (\JsonException) {
+                //A single malformed script must not prevent using the other scripts or the meta tag fallback
+                continue;
+            }
+
+            $product = $this->findProductInJsonLd($jsonLd);
+            if ($product !== null) {
+                return $this->productJsonLdToPart($product, $canonicalURL, $dom);
+            }
+        }
+
+        //If no JSON-LD data is found, try to extract basic data from meta tags
+        return $this->metaTagsToPart($dom, $canonicalURL);
+    }
+
+    public function getCapabilities(): array
+    {
+        return [
+            ProviderCapabilities::BASIC,
+            ProviderCapabilities::PICTURE,
+            ProviderCapabilities::PRICE
+        ];
+    }
+}
diff --git a/src/Services/Trees/ToolsTreeBuilder.php b/src/Services/Trees/ToolsTreeBuilder.php
index 37a09b09..c8afac12 100644
--- a/src/Services/Trees/ToolsTreeBuilder.php
+++ b/src/Services/Trees/ToolsTreeBuilder.php
@@ -39,6 +39,8 @@ use App\Entity\UserSystem\User;
 use App\Helpers\Trees\TreeViewNode;
 use App\Services\Cache\UserCacheKeyGenerator;
 use App\Services\ElementTypeNameGenerator;
+use App\Services\InfoProviderSystem\Providers\GenericWebProvider;
+use App\Settings\InfoProviderSystem\GenericWebProviderSettings;
 use Symfony\Bundle\SecurityBundle\Security;
 use Symfony\Component\Routing\Generator\UrlGeneratorInterface;
 use Symfony\Contracts\Cache\ItemInterface;
@@ -58,6 +60,7 @@ class ToolsTreeBuilder
         protected UserCacheKeyGenerator $keyGenerator,
         protected Security $security,
         private readonly ElementTypeNameGenerator $elementTypeNameGenerator,
+        private readonly GenericWebProviderSettings $genericWebProviderSettings
     ) {
     }
 
@@ -147,6 +150,13 @@ class ToolsTreeBuilder
                 $this->urlGenerator->generate('info_providers_search')
             ))->setIcon('fa-treeview fa-fw fa-solid fa-cloud-arrow-down');
 
+            //Only offer the "create from URL" tool if the generic web provider is enabled
+            if ($this->genericWebProviderSettings->enabled) {
+                $nodes[] = (new TreeViewNode(
+                    $this->translator->trans('info_providers.from_url.title'),
+                    $this->urlGenerator->generate('info_providers_from_url')
+                ))->setIcon('fa-treeview fa-fw fa-solid fa-book-atlas');
+            }
+
             $nodes[] = (new TreeViewNode(
                 $this->translator->trans('info_providers.bulk_import.manage_jobs'),
                 $this->urlGenerator->generate('bulk_info_provider_manage')
diff --git a/src/Settings/InfoProviderSystem/GenericWebProviderSettings.php 
b/src/Settings/InfoProviderSystem/GenericWebProviderSettings.php new file mode 100644 index 00000000..07972141 --- /dev/null +++ b/src/Settings/InfoProviderSystem/GenericWebProviderSettings.php @@ -0,0 +1,43 @@ +. + */ + +declare(strict_types=1); + + +namespace App\Settings\InfoProviderSystem; + +use App\Settings\SettingsIcon; +use Jbtronics\SettingsBundle\Metadata\EnvVarMode; +use Jbtronics\SettingsBundle\Settings\Settings; +use Jbtronics\SettingsBundle\Settings\SettingsParameter; +use Jbtronics\SettingsBundle\Settings\SettingsTrait; +use Symfony\Component\Translation\TranslatableMessage as TM; + +/** + * Settings of the Generic Web URL information provider, registered under the name "generic_web_provider". + */ +#[Settings(name: "generic_web_provider", label: new TM("settings.ips.generic_web_provider"), description: new TM("settings.ips.generic_web_provider.description"))] +#[SettingsIcon("fa-plug")] +class GenericWebProviderSettings +{ + use SettingsTrait; + + /** + * Whether the provider is enabled; defaults to false. The env var PROVIDER_GENERIC_WEB_ENABLED (read as bool) + * is bound to this parameter with EnvVarMode::OVERWRITE. + * NOTE(review): the label uses the "settings.ips.lcsc.enabled" translation key - presumably a shared "enabled" label; confirm. + */ + #[SettingsParameter(label: new TM("settings.ips.lcsc.enabled"), description: new TM("settings.ips.generic_web_provider.enabled.help"), + envVar: "bool:PROVIDER_GENERIC_WEB_ENABLED", envVarMode: EnvVarMode::OVERWRITE + )] + public bool $enabled = false; +} diff --git a/src/Settings/InfoProviderSystem/InfoProviderSettings.php b/src/Settings/InfoProviderSystem/InfoProviderSettings.php index fb31bdb9..3e78233f 100644 --- a/src/Settings/InfoProviderSystem/InfoProviderSettings.php +++ b/src/Settings/InfoProviderSystem/InfoProviderSettings.php @@ -37,6 +37,9 @@ class InfoProviderSettings #[EmbeddedSettings] public ?InfoProviderGeneralSettings $general = null; + #[EmbeddedSettings] + public ?GenericWebProviderSettings $genericWebProvider = null; + #[EmbeddedSettings] public ?DigikeySettings $digikey = null; diff --git a/templates/_navbar.html.twig b/templates/_navbar.html.twig index 446ccdab..c4dfbe0f 100644 --- a/templates/_navbar.html.twig +++ b/templates/_navbar.html.twig @@ -10,9 +10,9 @@ - {% if is_granted("@tools.label_scanner") %} + {% if is_granted("@tools.label_scanner") %} - + {% endif %} @@ -52,6 +52,14 @@ {% trans 
%}info_providers.search.title{% endtrans %} + {% if settings_instance('generic_web_provider').enabled %} +
{% trans %}info_providers.from_url.help{% endtrans %}
+ + {{ form_start(form) }} + {{ form_row(form.url) }} + {{ form_row(form.submit) }} + {{ form_end(form) }} +{% endblock %} diff --git a/templates/info_providers/settings/provider_settings.html.twig b/templates/info_providers/settings/provider_settings.html.twig index 1876c2eb..86e5bc9b 100644 --- a/templates/info_providers/settings/provider_settings.html.twig +++ b/templates/info_providers/settings/provider_settings.html.twig @@ -10,7 +10,7 @@ {% block card_content %}