Allow POSTing pages from a browser plugin to Part-DB so it can retrieve them later

This commit is contained in:
Jan Böhmer 2026-05-14 12:54:08 +02:00
parent 6a3be77ec0
commit e33c13ecfa
7 changed files with 303 additions and 14 deletions

View file

@ -0,0 +1,78 @@
<?php
/*
* This file is part of Part-DB (https://github.com/Part-DB/Part-DB-symfony).
*
* Copyright (C) 2019 - 2026 Jan Böhmer (https://github.com/jbtronics)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
declare(strict_types=1);
namespace App\Controller;
use App\Services\InfoProviderSystem\SubmittedPageStorage;
use App\Services\InfoProviderSystem\DTOs\BrowserSubmittedPage;
use Symfony\Bundle\FrameworkBundle\Controller\AbstractController;
use Symfony\Component\HttpFoundation\JsonResponse;
use Symfony\Component\HttpFoundation\Request;
use Symfony\Component\HttpFoundation\Response;
use Symfony\Component\HttpKernel\Attribute\MapRequestPayload;
use Symfony\Component\Routing\Attribute\Route;
use Symfony\Component\Routing\Generator\UrlGeneratorInterface;
/**
 * Provides the endpoint used by browser extensions to submit the current page's HTML to Part-DB,
 * so that info providers can use it instead of fetching the URL themselves.
 */
#[Route('/tools/info_providers')]
class BrowserPluginController extends AbstractController
{
    /** Provider key used when the request body does not name a provider. */
    private const DEFAULT_PROVIDER = 'generic_web';

    public function __construct(private readonly SubmittedPageStorage $browserHtmlStorage)
    {
    }

    /**
     * Accepts a JSON POST body with the HTML of the current page from a browser extension.
     * The page is handed to SubmittedPageStorage, and the response contains a redirect URL pointing
     * to the standard part-creation flow, carrying the token under which the page was stored.
     *
     * Expected JSON body:
     *   { "html": "<full page HTML>", "url": "https://example.com/product", "title": "Page title", "provider": "generic_web" }
     * "url", "html" and "title" are mapped onto BrowserSubmittedPage and must not be blank.
     * The "provider" field is optional and defaults to "generic_web". Use "ai_web" for the AI extractor.
     *
     * Response:
     *   { "redirect_url": "https://partdb.example.com/en/part/from_info_provider/generic_web/https%3A%2F%2F.../create?submitted_page_token=<token>" }
     *
     * @param Request              $request The current request; only used to read the optional "provider" field.
     * @param BrowserSubmittedPage $page    The submitted page, mapped and validated from the request payload.
     *
     * @return JsonResponse A JSON object with the absolute "redirect_url".
     */
    #[Route('/browser_html', name: 'browser_plugin_submit_html', methods: ['POST'])]
    public function submitHtml(
        Request $request,
        #[MapRequestPayload]
        BrowserSubmittedPage $page,
    ): JsonResponse
    {
        $this->denyAccessUnlessGranted('@info_providers.create_parts');

        // "provider" is not a field of BrowserSubmittedPage, so it has to be read from the raw payload.
        $provider = $request->getPayload()->getString('provider', self::DEFAULT_PROVIDER);
        if ($provider === '') {
            // An empty provider key cannot address a provider; treat it like an omitted field.
            $provider = self::DEFAULT_PROVIDER;
        }

        // The MapRequestPayload attribute already validated the URL and HTML content.
        $token = $this->browserHtmlStorage->store($page);

        $redirectUrl = $this->generateUrl('info_providers_create_part', [
            'providerKey' => $provider,
            'providerId' => $page->url,
            'submitted_page_token' => $token,
        ], UrlGeneratorInterface::ABSOLUTE_URL);

        return new JsonResponse(['redirect_url' => $redirectUrl]);
    }
}

View file

@ -0,0 +1,50 @@
<?php
/*
* This file is part of Part-DB (https://github.com/Part-DB/Part-DB-symfony).
*
* Copyright (C) 2019 - 2026 Jan Böhmer (https://github.com/jbtronics)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
declare(strict_types=1);
namespace App\Services\InfoProviderSystem\DTOs;
use Symfony\Component\Validator\Constraints as Assert;
/**
 * A web page handed over by the browser extension. Instances are kept in the application cache for a limited time.
 */
final readonly class BrowserSubmittedPage
{
    /**
     * @var string Token identifying this page, computed from the URL and the HTML content.
     *             It lets the page be looked up in the cache without keeping the full HTML in the session.
     */
    public string $token;

    /**
     * @param string             $url         Address of the submitted page; must be a non-blank, valid URL.
     * @param string             $html        Markup of the submitted page; must be non-blank and within the length limit.
     * @param string             $title       Title of the submitted page; must be non-blank.
     * @param \DateTimeImmutable $submittedAt Moment of submission; defaults to the time of construction.
     */
    public function __construct(
        #[Assert\Url()]
        #[Assert\NotBlank]
        public string $url,
        #[Assert\NotBlank]
        #[Assert\Length(max: 5 * 1024 * 1024)] // Cap the HTML at 5 * 1024 * 1024 (about 5 MB) to prevent abuse
        public string $html,
        #[Assert\NotBlank]
        public string $title,
        public \DateTimeImmutable $submittedAt = new \DateTimeImmutable(),
    ) {
        // URL and HTML are joined with "|" and hashed, so the same page content always yields the same token.
        $hashInput = sprintf('%s|%s', $url, $html);
        $this->token = hash('xxh3', $hashInput);
    }
}

View file

@ -175,15 +175,15 @@ final class PartInfoRetriever
*/
public function dtoToPart(PartDetailDTO $search_result): Part
{
// NOTE(review): this hunk comes from a diff view without +/- markers. The first return
// (which goes through createPart() with the DTO's provider key and ID) looks like the removed
// line, the second (which converts the given DTO directly) like its replacement — confirm
// against the actual file.
return $this->createPart($search_result->provider_key, $search_result->provider_id);
return $this->dto_to_entity_converter->convertPart($search_result);
}
/**
* Use the given details to create a part entity
*
* Fetches the details for the given provider key and part ID via getDetails() and converts
* the result into a Part using the DTO-to-entity converter.
*
* NOTE(review): this hunk comes from a diff view without +/- markers; each pair of
* near-identical lines below presumably shows the removed line followed by the added one —
* confirm against the actual file.
* NOTE(review): the three-argument signature has no default for $options, so callers that
* still pass two arguments would fail; consider `array $options = []`.
*/
public function createPart(string $provider_key, string $part_id): Part
public function createPart(string $provider_key, string $part_id, array $options): Part
{
$details = $this->getDetails($provider_key, $part_id);
$details = $this->getDetails($provider_key, $part_id, $options);
return $this->dto_to_entity_converter->convertPart($details);
}

View file

@ -27,12 +27,11 @@ namespace App\Services\InfoProviderSystem\Providers;
use App\Exceptions\ProviderIDNotSupportedException;
use App\Helpers\RandomizeUseragentHttpClient;
use App\Services\AI\AIPlatformRegistry;
use App\Services\InfoProviderSystem\SubmittedPageStorage;
use App\Services\InfoProviderSystem\CreateFromUrlHelper;
use App\Services\InfoProviderSystem\DTOJsonSchemaConverter;
use App\Services\InfoProviderSystem\DTOs\PartDetailDTO;
use App\Settings\InfoProviderSystem\AIExtractorSettings;
use Brick\Schema\SchemaReader;
use Imagine\Image\Format;
use Jkphl\Micrometa;
use League\HTMLToMarkdown\HtmlConverter;
use Psr\Cache\CacheItemPoolInterface;
@ -62,6 +61,7 @@ final class AIWebProvider implements InfoProviderInterface
private readonly DTOJsonSchemaConverter $jsonSchemaConverter,
private readonly CacheItemPoolInterface $partInfoCache,
private readonly CreateFromUrlHelper $createFromUrlHelper,
private readonly SubmittedPageStorage $browserHtmlStorage,
) {
//Use NoPrivateNetworkHttpClient to prevent SSRF vulnerabilities, and RandomizeUseragentHttpClient to make it harder for servers to block us
$this->httpClient = (new RandomizeUseragentHttpClient(new NoPrivateNetworkHttpClient($httpClient)))->withOptions(
@ -142,9 +142,17 @@ final class AIWebProvider implements InfoProviderInterface
return $cacheItem->get();
}
// Fetch HTML content
$response = $this->httpClient->request('GET', $url);
$html = $response->getContent();
// Use pre-fetched browser HTML if the option is set and a stored page is available for this URL
$html = null;
if (($token = ($options[self::OPTION_SUBMITTED_PAGE_TOKEN] ?? '')) !== '') {
$html = $this->browserHtmlStorage->retrieve($token)?->html;
}
//Otherwise fetch it ourselves.
if ($html === null) {
$response = $this->httpClient->request('GET', $url);
$html = $response->getContent();
}
//Convert html to markdown, to provide a cleaner input to the LLM.
$markdown = $this->htmlToMarkdown($html, $url);
@ -176,9 +184,20 @@ final class AIWebProvider implements InfoProviderInterface
*/
private function extractStructuredData(string $html, string $url): string
{
// NOTE(review): this hunk comes from a diff view without +/- markers. The three lines directly
// below look like the removed version of the parsing step, and the try/catch that follows like
// its replacement — confirm against the actual file.
//Only parse microdata, json-ld and rdfa, as they are the most common formats for structured data on product pages. Links and microformat only create clutter for the LLM
$micrometa = new Micrometa\Ports\Parser(Micrometa\Ports\Format::JSON_LD | Micrometa\Ports\Format::MICRODATA | Micrometa\Ports\Format::RDFA_LITE);
$items = $micrometa($url, $html);
try {
//Only parse microdata, json-ld and rdfa, as they are the most common formats for structured data on product pages. Links and microformat only create clutter for the LLM
$micrometa = new Micrometa\Ports\Parser(Micrometa\Ports\Format::JSON_LD | Micrometa\Ports\Format::MICRODATA | Micrometa\Ports\Format::RDFA_LITE);
$items = $micrometa($url, $html);
} catch (\RuntimeException $exception) {
//If parsing fails, try again without rdfa, as it seems to cause problems on pages like ebay
try {
$micrometa = new Micrometa\Ports\Parser(Micrometa\Ports\Format::JSON_LD | Micrometa\Ports\Format::MICRODATA);
$items = $micrometa($url, $html);
} catch (\RuntimeException $exception) {
//If it still fails, return empty structured data
return '{}';
}
}
// Serialize the parsed items to JSON; JSON_THROW_ON_ERROR makes an encoding failure raise instead of returning false
return json_encode($items->toObject(), JSON_THROW_ON_ERROR);
}

View file

@ -25,6 +25,7 @@ namespace App\Services\InfoProviderSystem\Providers;
use App\Exceptions\ProviderIDNotSupportedException;
use App\Helpers\RandomizeUseragentHttpClient;
use App\Services\InfoProviderSystem\SubmittedPageStorage;
use App\Services\InfoProviderSystem\CreateFromUrlHelper;
use App\Services\InfoProviderSystem\DTOs\ParameterDTO;
use App\Services\InfoProviderSystem\DTOs\PartDetailDTO;
@ -57,6 +58,7 @@ class GenericWebProvider implements InfoProviderInterface
public function __construct(HttpClientInterface $httpClient, private readonly GenericWebProviderSettings $settings,
private readonly CreateFromUrlHelper $createFromUrlHelper,
private readonly SubmittedPageStorage $browserHtmlStorage,
)
{
//Use NoPrivateNetworkHttpClient to prevent SSRF vulnerabilities, and RandomizeUseragentHttpClient to make it harder for servers to block us
@ -294,9 +296,17 @@ class GenericWebProvider implements InfoProviderInterface
}
}
//Try to get the webpage content
$response = $this->httpClient->request('GET', $url);
$content = $response->getContent();
// Use pre-fetched browser HTML if the option is set and a stored page is available for this URL
$content = null;
if (($token = ($options[self::OPTION_SUBMITTED_PAGE_TOKEN] ?? '')) !== '') {
$content = $this->browserHtmlStorage->retrieve($token)?->html;
}
//Otherwise, fetch the page content ourselves
if ($content === null) {
$response = $this->httpClient->request('GET', $url);
$content = $response->getContent();
}
$dom = new Crawler($content);

View file

@ -30,6 +30,7 @@ interface InfoProviderInterface
{
public const OPTION_NO_CACHE = 'no_cache'; // if set to true, the provider should not use any cache and retrieve fresh data from the source
public const OPTION_SKIP_DELEGATION = 'skip_delegation'; // if set to true, the provider should not delegate the request to other providers, even if it supports delegation.
public const OPTION_SUBMITTED_PAGE_TOKEN = 'submitted_page_token'; // if set to a non-empty string, the provider should use the browser-submitted page with the given token (and retrieve it from SubmittedPageStorage)
/**
* Get information about this provider

View file

@ -0,0 +1,131 @@
<?php
/*
* This file is part of Part-DB (https://github.com/Part-DB/Part-DB-symfony).
*
* Copyright (C) 2019 - 2026 Jan Böhmer (https://github.com/jbtronics)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
declare(strict_types=1);
namespace App\Services\InfoProviderSystem;
use App\Services\InfoProviderSystem\DTOs\BrowserSubmittedPage;
use Psr\Cache\CacheItemPoolInterface;
use Symfony\Component\DomCrawler\Crawler;
use Symfony\Component\HttpFoundation\RequestStack;
/**
 * Stores browser-submitted pages for the browser extension feature.
 *
 * Each page is stored as a {@see BrowserSubmittedPage} DTO in the application cache with a short TTL.
 * The session holds only a compact list of the tokens of recently submitted pages, so that pages can
 * be listed without bloating the session with HTML content.
 */
class SubmittedPageStorage
{
    private const CACHE_KEY_PREFIX = 'browser_plugin_html_';
    private const CACHE_TTL = 1800; // 30 minutes
    private const SESSION_KEY = 'browser_plugin_recent_urls';
    private const MAX_RECENT = 10;

    public function __construct(
        private readonly RequestStack $requestStack,
        private readonly CacheItemPoolInterface $cache,
    ) {
    }

    /**
     * Stores a submitted page in the cache and records its token at the front of the session's recent list.
     * The recent list is capped at MAX_RECENT entries and never contains the same token twice.
     *
     * @return string The token under which the page was stored, derived from the URL and HTML. This token is used to retrieve the page later. It is the same value as $page->token.
     */
    public function store(BrowserSubmittedPage $page): string
    {
        $item = $this->cache->getItem($this->cacheKey($page));
        $item->set($page);
        $item->expiresAfter(self::CACHE_TTL);
        $this->cache->save($item);

        // The recent list must contain tokens (not URLs), because getRecentPages() and remove()
        // look the entries up as tokens.
        $tokens = $this->recentTokensWithout($page->token);
        array_unshift($tokens, $page->token);
        $this->requestStack->getSession()->set(self::SESSION_KEY, array_slice($tokens, 0, self::MAX_RECENT));

        return $page->token;
    }

    /**
     * Retrieves the stored page via its token (which is derived from the URL and HTML).
     * Returns null if the page is not found or expired, if the token cannot be used as a cache key,
     * or if the cache entry does not hold a BrowserSubmittedPage.
     */
    public function retrieve(string $token): ?BrowserSubmittedPage
    {
        try {
            $item = $this->cache->getItem($this->cacheKey($token));
        } catch (\Psr\Cache\InvalidArgumentException) {
            // The token may come straight from a request option; a token that is not a legal
            // cache key cannot belong to any stored page.
            return null;
        }

        if (!$item->isHit()) {
            return null;
        }

        $page = $item->get();

        return $page instanceof BrowserSubmittedPage ? $page : null;
    }

    /**
     * Returns the list of recently submitted pages, newest first.
     * Pages whose cache entry has expired are silently omitted.
     * The list depends on the session and thus is per-browser and per-user.
     *
     * @return BrowserSubmittedPage[]
     */
    public function getRecentPages(): array
    {
        $pages = [];
        foreach ($this->requestStack->getSession()->get(self::SESSION_KEY, []) as $token) {
            // Skip anything that is not a string instead of failing on a malformed session entry.
            $page = is_string($token) ? $this->retrieve($token) : null;
            if ($page !== null) {
                $pages[] = $page;
            }
        }

        return $pages;
    }

    /**
     * Removes a page from both the cache and the recent list.
     *
     * @param BrowserSubmittedPage|string $page The page or its token to remove.
     */
    public function remove(BrowserSubmittedPage|string $page): void
    {
        $this->cache->deleteItem($this->cacheKey($page));

        $token = is_string($page) ? $page : $page->token;

        // Remove the token from the recent list in the session:
        $this->requestStack->getSession()->set(self::SESSION_KEY, $this->recentTokensWithout($token));
    }

    /**
     * Returns the session's recent-token list without the given token, re-indexed from zero.
     *
     * @return list<string>
     */
    private function recentTokensWithout(string $token): array
    {
        return array_values(array_filter(
            $this->requestStack->getSession()->get(self::SESSION_KEY, []),
            static fn (mixed $entry): bool => is_string($entry) && $entry !== $token,
        ));
    }

    /**
     * Builds the cache key for a page or for a page token.
     */
    private function cacheKey(BrowserSubmittedPage|string $token): string
    {
        if (!is_string($token)) {
            $token = $token->token;
        }

        return self::CACHE_KEY_PREFIX . $token;
    }
}