. */ declare(strict_types=1); namespace App\Services\InfoProviderSystem\Providers; use App\Exceptions\ProviderIDNotSupportedException; use App\Services\AI\AIPlatformRegistry; use App\Services\InfoProviderSystem\DTOJsonSchemaConverter; use App\Services\InfoProviderSystem\DTOs\PartDetailDTO; use App\Settings\InfoProviderSystem\AIExtractorSettings; use Brick\Schema\SchemaReader; use Jkphl\Micrometa; use League\HTMLToMarkdown\HtmlConverter; use Symfony\AI\Platform\Message\Message; use Symfony\AI\Platform\Message\MessageBag; use Symfony\Component\DomCrawler\Crawler; use Symfony\Component\Intl\Languages; use Symfony\Contracts\HttpClient\HttpClientInterface; use function Symfony\Component\String\u; final class AIInfoExtractor implements InfoProviderInterface { use FixAndValidateUrlTrait; private const DISTRIBUTOR_NAME = 'AI Extracted'; private readonly HttpClientInterface $httpClient; public function __construct( HttpClientInterface $httpClient, private readonly AIExtractorSettings $settings, private readonly AIPlatformRegistry $AIPlatformRegistry, private readonly DTOJsonSchemaConverter $jsonSchemaConverter, ) { $this->httpClient = $httpClient->withOptions([ 'timeout' => 30, 'headers' => [ 'User-Agent' => 'Mozilla/5.0 (compatible; Part-DB AI-Extractor/1.0)', ], ]); } public function getProviderInfo(): array { return [ 'name' => 'AI Information Extractor', 'description' => 'Extract part info from any URL using OpenRouter LLM', //'url' => 'https://openrouter.ai', 'disabled_help' => 'Configure OpenRouter API key in settings', 'settings_class' => AIExtractorSettings::class, ]; } public function getProviderKey(): string { return 'ai_extractor'; } public function isActive(): bool { return $this->settings->platform !== null && $this->settings->model !== null && $this->settings->model !== ''; } public function searchByKeyword(string $keyword): array { try { return [ $this->getDetails($keyword) ]; } catch (ProviderIDNotSupportedException $e) { return []; } } public function getDetails(string $id): PartDetailDTO { $url = $this->fixAndValidateURL($id); // Fetch HTML content $response = $this->httpClient->request('GET', $url); $html = $response->getContent(); // Clean HTML /*$cleanedHtml = $this->cleanHTML($html); // Truncate to max content length $truncatedHtml = $this->truncateHTML($cleanedHtml, $this->settings->maxContentLength);*/ //Convert html to markdown, to provide a cleaner input to the LLM. $markdown = $this->htmlToMarkdown($html); //Truncate markdown to max content length, if needed $markdown = u($markdown)->truncate($this->settings->maxContentLength, '... [truncated]')->toString(); //Extract structured data using traditional methods, to provide additional context to the LLM. This can help improve accuracy, especially for technical specifications that might be in tables or specific formats. $structuredData = $this->extractStructuredData($html, $url); // Call LLM $llmResponse = $this->callLLM($markdown, $url, $structuredData); // Build and return PartDetailDTO $result = $this->jsonSchemaConverter->jsonToDTO($llmResponse, $this->getProviderKey(), $url, $url, self::DISTRIBUTOR_NAME); return $result; } /** * Extracts structured data from the HTML using microformats. * @param string $html * @param string $url * @return string JSON encoded structured data */ private function extractStructuredData(string $html, string $url): string { $micrometa = new Micrometa\Ports\Parser(); $items = $micrometa($url, $html); return json_encode($items->toObject(), JSON_THROW_ON_ERROR); } private function htmlToMarkdown(string $html): string { //Extract only the main content of the page to avoid overwhelming the LLM with irrelevant information. $crawler = new Crawler($html); $mainContent = $crawler->filter('main, article, #content'); // If we found a specific content area, get its HTML; otherwise, use the whole body. //Concat the html of all matched nodes, to provide more context to the LLM, especially for pages that use multiple sections for product info. if ($mainContent->count() > 0) { $htmlToConvert = ''; foreach ($mainContent as $node) { $htmlToConvert .= $node->ownerDocument->saveHTML($node); $htmlToConvert .= "\n\n"; // Add some spacing between sections } } else { //Use the whole body content, as it might contain relevant information, especially for simpler pages that don't have a clear main/content section. $htmlToConvert = $html; } //Concert to markdown $converter = new HtmlConverter([ 'strip_tags' => true, // Removes tags that aren't Markdown-compatible (like