. */

declare(strict_types=1);

namespace App\Services\InfoProviderSystem\Providers;

use App\Exceptions\ProviderIDNotSupportedException;
use App\Helpers\RandomizeUseragentHttpClient;
use App\Services\AI\AIPlatformRegistry;
use App\Services\InfoProviderSystem\CreateFromUrlHelper;
use App\Services\InfoProviderSystem\DTOJsonSchemaConverter;
use App\Services\InfoProviderSystem\DTOs\PartDetailDTO;
use App\Settings\InfoProviderSystem\AIExtractorSettings;
use Brick\Schema\SchemaReader;
use Jkphl\Micrometa;
use League\HTMLToMarkdown\HtmlConverter;
use Psr\Cache\CacheItemPoolInterface;
use Symfony\AI\Platform\Message\Message;
use Symfony\AI\Platform\Message\MessageBag;
use Symfony\Component\DomCrawler\Crawler;
use Symfony\Component\HttpClient\NoPrivateNetworkHttpClient;
use Symfony\Component\Intl\Languages;
use Symfony\Contracts\HttpClient\HttpClientInterface;

use function Symfony\Component\String\u;

/**
 * Info provider that extracts part information from an arbitrary web page with the help of an LLM.
 */
final class AIWebProvider implements InfoProviderInterface
{
    use FixAndValidateUrlTrait;

    /** Distributor name handed to the DTO converter for every part created by this provider. */
    private const DISTRIBUTOR_NAME = 'Website';

    /** The decorated HTTP client used to download the web pages. */
    private readonly HttpClientInterface $httpClient;

    /**
     * @param HttpClientInterface $httpClient The base HTTP client; it gets wrapped before it is used
     * @param AIExtractorSettings $settings The settings of this provider (platform, model, max content length)
     * @param AIPlatformRegistry $AIPlatformRegistry
     * @param DTOJsonSchemaConverter $jsonSchemaConverter Converts the LLM response into a PartDetailDTO
     * @param CacheItemPoolInterface $partInfoCache Cache pool used to store extracted part details
     * @param CreateFromUrlHelper $createFromUrlHelper Used to delegate URLs to other providers
     */
    public function __construct(
        HttpClientInterface $httpClient,
        private readonly AIExtractorSettings $settings,
        private readonly AIPlatformRegistry $AIPlatformRegistry,
        private readonly DTOJsonSchemaConverter $jsonSchemaConverter,
        private readonly CacheItemPoolInterface $partInfoCache,
        private readonly CreateFromUrlHelper $createFromUrlHelper,
    ) {
        //The NoPrivateNetworkHttpClient layer prevents SSRF vulnerabilities
        $ssrfSafeClient = new NoPrivateNetworkHttpClient($httpClient);

        //The RandomizeUseragentHttpClient layer makes it harder for servers to block us
        $randomizedClient = new RandomizeUseragentHttpClient($ssrfSafeClient);

        $this->httpClient = $randomizedClient->withOptions(['timeout' => 15]);
    }

    /**
     * Returns the static metadata describing this provider.
     * @return array
     */
    public function getProviderInfo(): array
    {
        return [
            'name' => 'AI Web Extractor',
            'description' => 'Extract part info from any URL using LLM',
            //'url' => 'https://openrouter.ai',
            'disabled_help' => 'Configure AI settings',
            'settings_class' => AIExtractorSettings::class,
        ];
    }

    public
function getProviderKey(): string
    {
        //Unique key of this provider; it is also passed to the DTO converter in getDetails()
        return 'ai_web';
    }

    /**
     * The provider is active once a platform is selected and a non-empty model name is configured.
     * @return bool
     */
    public function isActive(): bool
    {
        if ($this->settings->platform === null) {
            return false;
        }

        $model = $this->settings->model;

        return $model !== null && $model !== '';
    }

    /**
     * Treats the keyword as the URL of a product page and returns the part extracted from it.
     * @param string $keyword The URL to extract the part from
     * @param array $options Provider options; OPTION_SKIP_DELEGATION disables the delegation to other providers
     * @return array Either the delegated result, a single extracted part, or an empty array if
     *               getDetails() rejects the ID with a ProviderIDNotSupportedException
     */
    public function searchByKeyword(string $keyword, array $options = []): array
    {
        $url = $this->fixAndValidateURL($keyword);

        $skipDelegation = $options[self::OPTION_SKIP_DELEGATION] ?? false;
        if (!$skipDelegation) {
            //Before loading the page, give the other providers a chance to handle this URL
            $delegatedPart = $this->createFromUrlHelper->delegateToOtherProvider($url, $this);
            if ($delegatedPart !== null) {
                return [$delegatedPart];
            }
        }

        //Delegation was already attempted above, so getDetails must skip it to prevent infinite loops
        $detailOptions = $options;
        $detailOptions[self::OPTION_SKIP_DELEGATION] = true;

        try {
            return [$this->getDetails($keyword, $detailOptions)];
        } catch (ProviderIDNotSupportedException) {
            return [];
        }
    }

    /**
     * Downloads the page behind the given URL and lets the LLM extract the part details from it.
     * Results are cached for two hours per URL.
     * @param string $id The URL of the page describing the part
     * @param array $options Provider options; OPTION_SKIP_DELEGATION disables the delegation to other providers,
     *                       OPTION_NO_CACHE discards a cached result and forces a fresh extraction
     * @return PartDetailDTO
     */
    public function getDetails(string $id, array $options = []): PartDetailDTO
    {
        $url = $this->fixAndValidateURL($id);

        $skipDelegation = $options[self::OPTION_SKIP_DELEGATION] ?? false;
        if (!$skipDelegation) {
            //Before loading the page, give the other providers a chance to handle this URL
            $delegatedPart = $this->createFromUrlHelper->delegateToOtherProviderDetails($url, $this);
            if ($delegatedPart !== null) {
                return $delegatedPart;
            }
        }

        //LLM calls can be slow and costly, so results are cached per URL
        $cacheKey = 'ai_web_'.hash('xxh3', $url);

        //If the no-cache option is set, drop the stored entry so the lookup below misses
        if ($options[self::OPTION_NO_CACHE] ?? false) {
            $this->partInfoCache->deleteItem($cacheKey);
        }

        $cacheItem = $this->partInfoCache->getItem($cacheKey);
        if ($cacheItem->isHit()) {
            $cachedResult = $cacheItem->get();
            //The cache returns a mixed value. Only return it if it really is a DTO; anything else would
            //violate the return type, so it is treated like a cache miss and overwritten below.
            if ($cachedResult instanceof PartDetailDTO) {
                return $cachedResult;
            }
        }

        //Fetch the HTML content of the page
        $html = $this->httpClient->request('GET', $url)->getContent();

        //Markdown is a cleaner input for the LLM than raw HTML; it is cut down to the configured maximum length
        $markdown = u($this->htmlToMarkdown($html))
            ->truncate($this->settings->maxContentLength, '... [truncated]')
            ->toString();

        //Structured data embedded in the page is passed along as additional context for the LLM
        $structuredData = $this->extractStructuredData($html, $url);

        $llmResponse = $this->callLLM($markdown, $url, $structuredData);

        $result = $this->jsonSchemaConverter->jsonToDTO(
            $llmResponse,
            $this->getProviderKey(),
            $url,
            $url,
            self::DISTRIBUTOR_NAME
        );

        //Cache for 2 hours: web content can change frequently, but repeated accesses should not trigger new LLM calls
        $cacheItem->set($result);
        $cacheItem->expiresAfter(3600 * 2);
        $this->partInfoCache->save($cacheItem);

        return $result;
    }

    /**
     * Extracts the structured data embedded in the HTML using the Micrometa parser.
     * @param string $html The HTML source of the page
     * @param string $url The URL the HTML was loaded from
     * @return string JSON encoded structured data
     * @throws \JsonException If the extracted items cannot be encoded as JSON
     */
    private function extractStructuredData(string $html, string $url): string
    {
        $parser = new Micrometa\Ports\Parser();
        $items = $parser($url, $html);

        return json_encode($items->toObject(), JSON_THROW_ON_ERROR);
    }

    private function htmlToMarkdown(string $html): string
    {
        //Extract only the main content of the page to avoid overwhelming the LLM with irrelevant information.
        $crawler = new Crawler($html);
        $mainContent = $crawler->filter('main, article, #content');

        // If we found a specific content area, get its HTML; otherwise, use the whole body.
        //Concat the html of all matched nodes, to provide more context to the LLM, especially for pages that use multiple sections for product info.
if ($mainContent->count() > 0) { $htmlToConvert = ''; foreach ($mainContent as $node) { $htmlToConvert .= $node->ownerDocument->saveHTML($node); $htmlToConvert .= "\n\n"; // Add some spacing between sections } } else { //Use the whole body content, as it might contain relevant information, especially for simpler pages that don't have a clear main/content section. $htmlToConvert = $html; } //Concert to markdown $converter = new HtmlConverter([ 'strip_tags' => true, // Removes tags that aren't Markdown-compatible (like