. */

declare(strict_types=1);

namespace App\Services\InfoProviderSystem\Providers;

use App\Exceptions\ProviderIDNotSupportedException;
use App\Helpers\RandomizeUseragentHttpClient;
use App\Services\AI\AIPlatformRegistry;
use App\Services\InfoProviderSystem\CreateFromUrlHelper;
use App\Services\InfoProviderSystem\DTOJsonSchemaConverter;
use App\Services\InfoProviderSystem\DTOs\PartDetailDTO;
use App\Settings\InfoProviderSystem\AIExtractorSettings;
use Brick\Schema\SchemaReader;
use Imagine\Image\Format;
use Jkphl\Micrometa;
use League\HTMLToMarkdown\HtmlConverter;
use Psr\Cache\CacheItemPoolInterface;
use Symfony\AI\Platform\Message\Message;
use Symfony\AI\Platform\Message\MessageBag;
use Symfony\Component\DomCrawler\Crawler;
use Symfony\Component\DomCrawler\UriResolver;
use Symfony\Component\HttpClient\NoPrivateNetworkHttpClient;
use Symfony\Component\Intl\Languages;
use Symfony\Contracts\HttpClient\HttpClientInterface;

use function Symfony\Component\String\u;

/**
 * Info provider that builds a part from an arbitrary web page with the help of an LLM.
 *
 * For a given URL the page is downloaded, converted to markdown, complemented with the structured data embedded
 * in the page and passed to callLLM(). The resulting DTO is cached per URL.
 */
final class AIWebProvider implements InfoProviderInterface
{
    use FixAndValidateUrlTrait;

    /** Distributor name passed to the JSON-to-DTO converter for every extracted part. */
    private const DISTRIBUTOR_NAME = 'Website';

    private readonly HttpClientInterface $httpClient;

    /**
     * @param HttpClientInterface $httpClient base client; it is wrapped before use and never called directly
     */
    public function __construct(
        HttpClientInterface $httpClient,
        private readonly AIExtractorSettings $settings,
        private readonly AIPlatformRegistry $AIPlatformRegistry,
        private readonly DTOJsonSchemaConverter $jsonSchemaConverter,
        private readonly CacheItemPoolInterface $partInfoCache,
        private readonly CreateFromUrlHelper $createFromUrlHelper,
    ) {
        //Use NoPrivateNetworkHttpClient to prevent SSRF vulnerabilities, and RandomizeUseragentHttpClient to make it harder for servers to block us
        $this->httpClient = (new RandomizeUseragentHttpClient(new NoPrivateNetworkHttpClient($httpClient)))->withOptions(
            [
                'timeout' => 15,
            ]
        );
    }

    /**
     * Returns the static metadata describing this provider.
     *
     * @return array<string, string> keys: name, description, disabled_help, settings_class
     */
    public function getProviderInfo(): array
    {
        return [
            'name' => 'AI Web Extractor',
            'description' => 'Extract part info from any URL using LLM',
            //'url' => 'https://openrouter.ai',
            'disabled_help' => 'Configure AI settings',
            'settings_class' => AIExtractorSettings::class,
        ];
    }

    /**
     * Returns the key identifying this provider; it is also handed to the DTO converter in getDetails().
     */
    public function getProviderKey(): string
    {
        return 'ai_web';
    }

    /**
     * The provider is usable only when a platform is selected and a non-empty model name is configured.
     */
    public function isActive(): bool
    {
        return $this->settings->platform !== null
            && $this->settings->model !== null
            && $this->settings->model !== '';
    }

    /**
     * Treats the keyword as a URL and returns at most one result for it.
     *
     * Unless OPTION_SKIP_DELEGATION is set, other providers get the chance to handle the URL first.
     *
     * NOTE(review): fixAndValidateURL() runs outside the try block, so whatever it throws for an unusable
     * keyword propagates to the caller instead of yielding an empty result - confirm this is intended.
     *
     * @param string $keyword the URL to extract the part from
     * @param array $options provider options; OPTION_SKIP_DELEGATION is read here, the rest is forwarded to getDetails()
     * @return PartDetailDTO[] a single-element list, or an empty array if getDetails() rejects the ID
     */
    public function searchByKeyword(string $keyword, array $options = []): array
    {
        $url = $this->fixAndValidateURL($keyword);

        if (!($options[self::OPTION_SKIP_DELEGATION] ?? false)) {
            //Before loading the page, try to delegate to another provider
            $delegatedPart = $this->createFromUrlHelper->delegateToOtherProvider($url, $this);
            if ($delegatedPart !== null) {
                return [$delegatedPart];
            }
        }

        try {
            $new_options = $options;
            $new_options[self::OPTION_SKIP_DELEGATION] = true; //Skip delegation for the getDetails call to prevent infinite loops

            return [
                $this->getDetails($keyword, $new_options),
            ];
        } catch (ProviderIDNotSupportedException $e) {
            return [];
        }
    }

    /**
     * Extracts the part details for the page behind the given URL.
     *
     * Order of operations: optional delegation to another provider, cache lookup, page download, conversion to
     * (truncated) markdown, structured data extraction, LLM call, conversion of the LLM answer to a DTO and
     * finally caching of that DTO for two hours.
     *
     * @param string $id the URL of the page describing the part
     * @param array $options provider options; OPTION_SKIP_DELEGATION and OPTION_NO_CACHE are read here
     * @return PartDetailDTO the delegated, cached or freshly extracted part
     */
    public function getDetails(string $id, array $options = []): PartDetailDTO
    {
        $url = $this->fixAndValidateURL($id);

        if (!($options[self::OPTION_SKIP_DELEGATION] ?? false)) {
            //Before loading the page, try to delegate to another provider
            $delegatedPart = $this->createFromUrlHelper->delegateToOtherProviderDetails($url, $this);
            if ($delegatedPart !== null) {
                return $delegatedPart;
            }
        }

        //Check if we have a cached result for this URL, to avoid unnecessary LLM calls, which can be slow and costly.
        $cacheKey = 'ai_web_'.hash('xxh3', $url);

        //If ignore cache option is set, skip cache and fetch fresh data
        if ($options[self::OPTION_NO_CACHE] ?? false) {
            $this->partInfoCache->deleteItem($cacheKey);
        }

        //Return cached result if available
        $cacheItem = $this->partInfoCache->getItem($cacheKey);
        if ($cacheItem->isHit()) {
            $cached = $cacheItem->get();
            //The cache returns mixed: only trust the entry if it really is a DTO. Anything else (e.g. an entry that
            //could not be restored properly) would violate the return type, so it is treated as a miss and overwritten below.
            if ($cached instanceof PartDetailDTO) {
                return $cached;
            }
        }

        // Fetch HTML content
        $response = $this->httpClient->request('GET', $url);
        $html = $response->getContent();

        //Convert html to markdown, to provide a cleaner input to the LLM.
        $markdown = $this->htmlToMarkdown($html, $url);

        //Truncate markdown to max content length, if needed
        $markdown = u($markdown)->truncate($this->settings->maxContentLength, '... [truncated]')->toString();

        //Extract structured data using traditional methods, to provide additional context to the LLM. This can help improve accuracy, especially for technical specifications that might be in tables or specific formats.
        $structuredData = $this->extractStructuredData($html, $url);

        // Call LLM
        $llmResponse = $this->callLLM($markdown, $url, $structuredData);

        // Build and return PartDetailDTO
        $result = $this->jsonSchemaConverter->jsonToDTO($llmResponse, $this->getProviderKey(), $url, $url, self::DISTRIBUTOR_NAME);

        // Cache the result for future use, to improve performance and reduce costs.
        $cacheItem->set($result);
        $cacheItem->expiresAfter(3600 * 2); //Cache for 2 hours, as web content can change frequently, but we still want to benefit from caching for repeated accesses.
        $this->partInfoCache->save($cacheItem);

        return $result;
    }

    /**
     * Extracts the structured data (JSON-LD, microdata, RDFa Lite) embedded in the HTML.
     *
     * The result is only supplementary context for the LLM, so the encoding is deliberately lenient: invalid UTF-8
     * sequences are substituted instead of aborting the extraction, and slashes and unicode characters are left
     * unescaped to keep URLs and text readable and compact in the prompt.
     *
     * @param string $html
     * @param string $url
     * @return string JSON encoded structured data
     * @throws \JsonException if the data cannot be encoded for another reason than invalid UTF-8
     */
    private function extractStructuredData(string $html, string $url): string
    {
        //Only parse microdata, json-ld and rdfa, as they are the most common formats for structured data on product pages. Links and microformat only create clutter for the LLM
        $micrometa = new Micrometa\Ports\Parser(Micrometa\Ports\Format::JSON_LD | Micrometa\Ports\Format::MICRODATA | Micrometa\Ports\Format::RDFA_LITE);
        $items = $micrometa($url, $html);

        return json_encode(
            $items->toObject(),
            JSON_THROW_ON_ERROR | JSON_INVALID_UTF8_SUBSTITUTE | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE
        );
    }

    private function htmlToMarkdown(string $html, string $url): string
    {
        $crawler = new Crawler($html);

        //Replace relative URLs with absolute URLs, to ensure that the LLM has full context and can access the links if needed.
        $baseUrl = $crawler->getBaseHref() ??
$url; //Replace all relative links with their absolute counnterparts, to provide more context to the LLM and to ensure that any links included in the markdown are valid and can be accessed if needed. $crawler->filter('a')->each(function (Crawler $node) use ($baseUrl) { $href = $node->attr('href'); if ($href) { $absoluteUrl = UriResolver::resolve($href, $baseUrl); $node->getNode(0)->setAttribute('href', $absoluteUrl); } }); $crawler->filter('img')->each(function (Crawler $node) use ($baseUrl) { $src = $node->attr('src'); if ($src) { $absoluteUrl = UriResolver::resolve($src, $baseUrl); $node->getNode(0)->setAttribute('src', $absoluteUrl); } }); //Extract only the main content of the page to avoid overwhelming the LLM with irrelevant information. $mainContent = $crawler->filter('main, article, #content'); // If we found a specific content area, get its HTML; otherwise, use the whole body. //Concat the html of all matched nodes, to provide more context to the LLM, especially for pages that use multiple sections for product info. if ($mainContent->count() > 0) { $htmlToConvert = ''; foreach ($mainContent as $node) { $htmlToConvert .= $node->ownerDocument->saveHTML($node); $htmlToConvert .= "\n\n"; // Add some spacing between sections } } else { //Use the whole body content, as it might contain relevant information, especially for simpler pages that don't have a clear main/content section. $htmlToConvert = $crawler->outerHtml(); } //Concert to markdown $converter = new HtmlConverter([ 'strip_tags' => true, // Removes tags that aren't Markdown-compatible (like