From e33c13ecfa43017134b9f6a501f6460de56b4635 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Thu, 14 May 2026 12:54:08 +0200 Subject: [PATCH] Allow to POST pages from a browser plugin to Part-DB so it can retrieve it later --- src/Controller/BrowserPluginController.php | 78 +++++++++++ .../DTOs/BrowserSubmittedPage.php | 50 +++++++ .../InfoProviderSystem/PartInfoRetriever.php | 6 +- .../Providers/AIWebProvider.php | 35 +++-- .../Providers/GenericWebProvider.php | 16 ++- .../Providers/InfoProviderInterface.php | 1 + .../SubmittedPageStorage.php | 131 ++++++++++++++++++ 7 files changed, 303 insertions(+), 14 deletions(-) create mode 100644 src/Controller/BrowserPluginController.php create mode 100644 src/Services/InfoProviderSystem/DTOs/BrowserSubmittedPage.php create mode 100644 src/Services/InfoProviderSystem/SubmittedPageStorage.php diff --git a/src/Controller/BrowserPluginController.php b/src/Controller/BrowserPluginController.php new file mode 100644 index 00000000..bae63444 --- /dev/null +++ b/src/Controller/BrowserPluginController.php @@ -0,0 +1,78 @@ +. + */ + +declare(strict_types=1); + +namespace App\Controller; + +use App\Services\InfoProviderSystem\SubmittedPageStorage; +use App\Services\InfoProviderSystem\DTOs\BrowserSubmittedPage; +use Symfony\Bundle\FrameworkBundle\Controller\AbstractController; +use Symfony\Component\HttpFoundation\JsonResponse; +use Symfony\Component\HttpFoundation\Request; +use Symfony\Component\HttpFoundation\Response; +use Symfony\Component\HttpKernel\Attribute\MapRequestPayload; +use Symfony\Component\Routing\Attribute\Route; +use Symfony\Component\Routing\Generator\UrlGeneratorInterface; + +/** + * Provides the endpoint used by browser extensions to submit the current page's HTML to Part-DB, + * so that info providers can use it instead of fetching the URL themselves. 
+ */
+#[Route('/tools/info_providers')]
+class BrowserPluginController extends AbstractController
+{
+    private const MAX_HTML_SIZE = 5 * 1024 * 1024; // 5 MB
+
+    public function __construct(private readonly SubmittedPageStorage $browserHtmlStorage)
+    {
+    }
+
+    /**
+     * Accepts a JSON POST body with the HTML of the current page from a browser extension.
+     * Stores the page via SubmittedPageStorage and returns a redirect URL pointing to the
+     * part-creation route, carrying the storage token in the submitted_page_token parameter.
+     *
+     * Expected JSON body: { "html": "...page HTML...", "title": "Page title", "url": "https://example.com/product", "provider": "generic_web" }
+     * The "provider" field is optional and defaults to "generic_web". Use "ai_web" for the AI extractor.
+     * Response: { "redirect_url": "https://partdb.example.com/en/part/from_info_provider/generic_web/https%3A%2F%2F.../create?submitted_page_token=..." }
+     */
+    #[Route('/browser_html', name: 'browser_plugin_submit_html', methods: ['POST'])]
+    public function submitHtml(Request $request,
+        #[MapRequestPayload]
+        BrowserSubmittedPage $page
+    ): JsonResponse
+    {
+        $this->denyAccessUnlessGranted('@info_providers.create_parts');
+
+        $provider = (string) $request->getPayload()->get('provider', 'generic_web'); // "provider" is not part of the DTO, so read it from the decoded payload
+
+        // The maprequestpayload already validates the URL and HTML content:
+        $token = $this->browserHtmlStorage->store($page);
+
+        $redirectUrl = $this->generateUrl('info_providers_create_part', [
+            'providerKey' => $provider,
+            'providerId' => $page->url,
+            'submitted_page_token' => $token,
+        ], UrlGeneratorInterface::ABSOLUTE_URL);
+
+        return new JsonResponse(['redirect_url' => $redirectUrl]);
+    }
+}
diff --git a/src/Services/InfoProviderSystem/DTOs/BrowserSubmittedPage.php b/src/Services/InfoProviderSystem/DTOs/BrowserSubmittedPage.php
new file mode 100644
index 00000000..0f4fbf5f
--- /dev/null
+++ b/src/Services/InfoProviderSystem/DTOs/BrowserSubmittedPage.php
@@ -0,0 +1,50 @@
+.
+ */
+
+declare(strict_types=1);
+
+namespace App\Services\InfoProviderSystem\DTOs;
+
+use Symfony\Component\Validator\Constraints as Assert;
+
+/**
+ * Represents a webpage submitted by the browser extension, held temporarily in the application cache.
+ */
+final readonly class BrowserSubmittedPage
+{
+    /**
+     * @var string A unique token for this page, derived from the URL and HTML content. Used to identify the page in the cache without storing the full HTML in the session.
+     */
+    public string $token;
+
+    public function __construct(
+        #[Assert\Url()]
+        #[Assert\NotBlank]
+        public string $url,
+        #[Assert\NotBlank]
+        #[Assert\Length(max: 5 * 1024 * 1024)] // Limit to 5 MB to prevent abuse
+        public string $html,
+        #[Assert\NotBlank]
+        public string $title,
+        public \DateTimeImmutable $submittedAt = new \DateTimeImmutable(),
+    ) {
+        $this->token = hash('xxh3', $url . '|' . $html);
+    }
+}
diff --git a/src/Services/InfoProviderSystem/PartInfoRetriever.php b/src/Services/InfoProviderSystem/PartInfoRetriever.php
index 6c10f10e..f5ff144d 100644
--- a/src/Services/InfoProviderSystem/PartInfoRetriever.php
+++ b/src/Services/InfoProviderSystem/PartInfoRetriever.php
@@ -175,15 +175,15 @@ final class PartInfoRetriever
      */
     public function dtoToPart(PartDetailDTO $search_result): Part
     {
-        return $this->createPart($search_result->provider_key, $search_result->provider_id);
+        return $this->dto_to_entity_converter->convertPart($search_result);
     }
 
     /**
      * Use the given details to create a part entity
      */
-    public function createPart(string $provider_key, string $part_id): Part
+    public function createPart(string $provider_key, string $part_id, array $options = []): Part
     {
-        $details = $this->getDetails($provider_key, $part_id);
+        $details = $this->getDetails($provider_key, $part_id, $options);
 
         return $this->dto_to_entity_converter->convertPart($details);
     }
diff --git a/src/Services/InfoProviderSystem/Providers/AIWebProvider.php b/src/Services/InfoProviderSystem/Providers/AIWebProvider.php
index 79f07be8..6539e69b 100644 --- a/src/Services/InfoProviderSystem/Providers/AIWebProvider.php +++ b/src/Services/InfoProviderSystem/Providers/AIWebProvider.php @@ -27,12 +27,11 @@ namespace App\Services\InfoProviderSystem\Providers; use App\Exceptions\ProviderIDNotSupportedException; use App\Helpers\RandomizeUseragentHttpClient; use App\Services\AI\AIPlatformRegistry; +use App\Services\InfoProviderSystem\SubmittedPageStorage; use App\Services\InfoProviderSystem\CreateFromUrlHelper; use App\Services\InfoProviderSystem\DTOJsonSchemaConverter; use App\Services\InfoProviderSystem\DTOs\PartDetailDTO; use App\Settings\InfoProviderSystem\AIExtractorSettings; -use Brick\Schema\SchemaReader; -use Imagine\Image\Format; use Jkphl\Micrometa; use League\HTMLToMarkdown\HtmlConverter; use Psr\Cache\CacheItemPoolInterface; @@ -62,6 +61,7 @@ final class AIWebProvider implements InfoProviderInterface private readonly DTOJsonSchemaConverter $jsonSchemaConverter, private readonly CacheItemPoolInterface $partInfoCache, private readonly CreateFromUrlHelper $createFromUrlHelper, + private readonly SubmittedPageStorage $browserHtmlStorage, ) { //Use NoPrivateNetworkHttpClient to prevent SSRF vulnerabilities, and RandomizeUseragentHttpClient to make it harder for servers to block us $this->httpClient = (new RandomizeUseragentHttpClient(new NoPrivateNetworkHttpClient($httpClient)))->withOptions( @@ -142,9 +142,17 @@ final class AIWebProvider implements InfoProviderInterface return $cacheItem->get(); } - // Fetch HTML content - $response = $this->httpClient->request('GET', $url); - $html = $response->getContent(); + // Use pre-fetched browser HTML if the option is set and a stored page is available for this URL + $html = null; + if (($token = ($options[self::OPTION_SUBMITTED_PAGE_TOKEN] ?? '')) !== '') { + $html = $this->browserHtmlStorage->retrieve($token)?->html; + } + + //Otherwise fetch it ourselves. 
+ if ($html === null) { + $response = $this->httpClient->request('GET', $url); + $html = $response->getContent(); + } //Convert html to markdown, to provide a cleaner input to the LLM. $markdown = $this->htmlToMarkdown($html, $url); @@ -176,9 +184,20 @@ final class AIWebProvider implements InfoProviderInterface */ private function extractStructuredData(string $html, string $url): string { - //Only parse microdata, json-ld and rdfa, as they are the most common formats for structured data on product pages. Links and microformat only create clutter for the LLM - $micrometa = new Micrometa\Ports\Parser(Micrometa\Ports\Format::JSON_LD | Micrometa\Ports\Format::MICRODATA | Micrometa\Ports\Format::RDFA_LITE); - $items = $micrometa($url, $html); + try { + //Only parse microdata, json-ld and rdfa, as they are the most common formats for structured data on product pages. Links and microformat only create clutter for the LLM + $micrometa = new Micrometa\Ports\Parser(Micrometa\Ports\Format::JSON_LD | Micrometa\Ports\Format::MICRODATA | Micrometa\Ports\Format::RDFA_LITE); + $items = $micrometa($url, $html); + } catch (\RuntimeException $exception) { + //If parsing fails, try again without rdfa, as it seems to cause problems on pages like ebay + try { + $micrometa = new Micrometa\Ports\Parser(Micrometa\Ports\Format::JSON_LD | Micrometa\Ports\Format::MICRODATA); + $items = $micrometa($url, $html); + } catch (\RuntimeException $exception) { + //If it still fails, return empty structured data + return '{}'; + } + } return json_encode($items->toObject(), JSON_THROW_ON_ERROR); } diff --git a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php index 06a9d4c1..45777f9e 100644 --- a/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php +++ b/src/Services/InfoProviderSystem/Providers/GenericWebProvider.php @@ -25,6 +25,7 @@ namespace App\Services\InfoProviderSystem\Providers; use 
App\Exceptions\ProviderIDNotSupportedException; use App\Helpers\RandomizeUseragentHttpClient; +use App\Services\InfoProviderSystem\SubmittedPageStorage; use App\Services\InfoProviderSystem\CreateFromUrlHelper; use App\Services\InfoProviderSystem\DTOs\ParameterDTO; use App\Services\InfoProviderSystem\DTOs\PartDetailDTO; @@ -57,6 +58,7 @@ class GenericWebProvider implements InfoProviderInterface public function __construct(HttpClientInterface $httpClient, private readonly GenericWebProviderSettings $settings, private readonly CreateFromUrlHelper $createFromUrlHelper, + private readonly SubmittedPageStorage $browserHtmlStorage, ) { //Use NoPrivateNetworkHttpClient to prevent SSRF vulnerabilities, and RandomizeUseragentHttpClient to make it harder for servers to block us @@ -294,9 +296,17 @@ class GenericWebProvider implements InfoProviderInterface } } - //Try to get the webpage content - $response = $this->httpClient->request('GET', $url); - $content = $response->getContent(); + // Use pre-fetched browser HTML if the option is set and a stored page is available for this URL + $content = null; + if (($token = ($options[self::OPTION_SUBMITTED_PAGE_TOKEN] ?? 
'')) !== '') { + $content = $this->browserHtmlStorage->retrieve($token)?->html; + } + + //Otherwise, fetch the page content ourselves + if ($content === null) { + $response = $this->httpClient->request('GET', $url); + $content = $response->getContent(); + } $dom = new Crawler($content); diff --git a/src/Services/InfoProviderSystem/Providers/InfoProviderInterface.php b/src/Services/InfoProviderSystem/Providers/InfoProviderInterface.php index a6e073a5..d3895795 100644 --- a/src/Services/InfoProviderSystem/Providers/InfoProviderInterface.php +++ b/src/Services/InfoProviderSystem/Providers/InfoProviderInterface.php @@ -30,6 +30,7 @@ interface InfoProviderInterface { public const OPTION_NO_CACHE = 'no_cache'; // if set to true, the provider should not use any cache and retrieve fresh data from the source public const OPTION_SKIP_DELEGATION = 'skip_delegation'; // if set to true, the provider should not delegate the request to other providers, even if it supports delegation. + public const OPTION_SUBMITTED_PAGE_TOKEN = 'submitted_page_token'; // if set to a non-empty string, the provider should use the browser-submitted page with the given token (and retrieve it from SubmittedPageStorage) /** * Get information about this provider diff --git a/src/Services/InfoProviderSystem/SubmittedPageStorage.php b/src/Services/InfoProviderSystem/SubmittedPageStorage.php new file mode 100644 index 00000000..f536531b --- /dev/null +++ b/src/Services/InfoProviderSystem/SubmittedPageStorage.php @@ -0,0 +1,131 @@ +. + */ + +declare(strict_types=1); + +namespace App\Services\InfoProviderSystem; + +use App\Services\InfoProviderSystem\DTOs\BrowserSubmittedPage; +use Psr\Cache\CacheItemPoolInterface; +use Symfony\Component\DomCrawler\Crawler; +use Symfony\Component\HttpFoundation\RequestStack; + +/** + * Stores browser-submitted pages for the browser extension feature. + * + * Each page is stored as a {@see BrowserSubmittedPage} DTO in the application cache with a short TTL. 
+ * The session holds only a compact list of recently submitted page tokens so that pages can be listed
+ * without bloating the session with HTML content.
+ */
+class SubmittedPageStorage
+{
+    private const CACHE_KEY_PREFIX = 'browser_plugin_html_';
+    private const CACHE_TTL = 1800; // 30 minutes
+    private const SESSION_KEY = 'browser_plugin_recent_urls';
+    private const MAX_RECENT = 10;
+
+    public function __construct(
+        private readonly RequestStack $requestStack,
+        private readonly CacheItemPoolInterface $cache,
+    ) {
+    }
+
+    /**
+     * Stores a submitted page in the cache and records its token at the front of the session's recent list.
+     * @return string The token under which the page was stored, derived from the URL and HTML. This token is used to retrieve the page later. It is the same value as $page->token.
+     */
+    public function store(BrowserSubmittedPage $page): string
+    {
+        $item = $this->cache->getItem($this->cacheKey($page));
+        $item->set($page);
+        $item->expiresAfter(self::CACHE_TTL);
+        $this->cache->save($item);
+
+        $session = $this->requestStack->getSession();
+        $tokens = array_values(array_filter(
+            $session->get(self::SESSION_KEY, []),
+            static fn(string $u): bool => $u !== $page->token,
+        ));
+        array_unshift($tokens, $page->token); // the list holds tokens (not URLs): retrieve() and remove() look entries up by token
+        $session->set(self::SESSION_KEY, array_slice($tokens, 0, self::MAX_RECENT));
+
+        return $page->token;
+    }
+
+    /**
+     * Retrieves the stored page via its token. Returns null if not found, expired, or if the token is not a valid cache key.
+     */
+    public function retrieve(string $token): ?BrowserSubmittedPage
+    {
+        if (preg_match('/^[A-Za-z0-9_.]+$/D', $token) !== 1) { // token may come from the query string; PSR-6 pools throw on reserved characters
+            return null;
+        }
+        $item = $this->cache->getItem($this->cacheKey($token));
+        return $item->isHit() ? $item->get() : null;
+    }
+
+    /**
+     * Returns the list of recently submitted pages, newest first.
+     * Pages whose cache entry has expired are silently omitted.
+     * The list depends on the session and thus is per-browser and per-user.
+     *
+     * @return BrowserSubmittedPage[]
+     */
+    public function getRecentPages(): array
+    {
+        $tokens = $this->requestStack->getSession()->get(self::SESSION_KEY, []);
+        $pages = [];
+        foreach ($tokens as $token) {
+            $page = $this->retrieve($token);
+            if ($page !== null) {
+                $pages[] = $page;
+            }
+        }
+        return $pages;
+    }
+
+    /**
+     * Removes a page from both the cache and the recent list.
+     * @param BrowserSubmittedPage|string $page The page or its token to remove.
+     */
+    public function remove(BrowserSubmittedPage|string $page): void
+    {
+        $this->cache->deleteItem($this->cacheKey($page));
+
+        $token = is_string($page) ? $page : $page->token;
+
+        $session = $this->requestStack->getSession();
+        //Remove the token from the recent list in the session:
+        $tokens = array_values(array_filter(
+            $session->get(self::SESSION_KEY, []),
+            static fn(string $u): bool => $u !== $token
+        ));
+        $session->set(self::SESSION_KEY, $tokens);
+    }
+
+    private function cacheKey(BrowserSubmittedPage|string $token): string
+    {
+        if (!is_string($token)) {
+            $token = $token->token;
+        }
+
+        return self::CACHE_KEY_PREFIX . $token;
+    }
+}