. */

declare(strict_types=1);

namespace App\Services\InfoProviderSystem\Providers;

use App\Exceptions\ProviderIDNotSupportedException;
use App\Services\AI\AIPlatformRegistry;
use App\Services\InfoProviderSystem\DTOJsonSchemaConverter;
use App\Services\InfoProviderSystem\DTOs\PartDetailDTO;
use App\Settings\InfoProviderSystem\AIExtractorSettings;
use Brick\Schema\SchemaReader;
use Jkphl\Micrometa;
use League\HTMLToMarkdown\HtmlConverter;
use Symfony\AI\Platform\Message\Message;
use Symfony\AI\Platform\Message\MessageBag;
use Symfony\Component\DomCrawler\Crawler;
use Symfony\Contracts\HttpClient\HttpClientInterface;

/**
 * Info provider that downloads a web page, converts it to markdown and asks the configured
 * AI model to extract part information from it.
 *
 * The "ID" of a part for this provider is the URL of the page the data is extracted from.
 */
final class AIInfoExtractor implements InfoProviderInterface
{
    use FixAndValidateUrlTrait;

    /** Distributor name handed to the DTO converter for every extracted part. */
    private const DISTRIBUTOR_NAME = 'AI Extracted';

    /** HTTP client preconfigured with the timeout and User-Agent used for fetching pages. */
    private readonly HttpClientInterface $httpClient;

    public function __construct(
        HttpClientInterface $httpClient,
        private readonly AIExtractorSettings $settings,
        private readonly AIPlatformRegistry $AIPlatformRegistry,
        private readonly DTOJsonSchemaConverter $jsonSchemaConverter,
    ) {
        $this->httpClient = $httpClient->withOptions([
            'timeout' => 30,
            'headers' => [
                'User-Agent' => 'Mozilla/5.0 (compatible; Part-DB AI-Extractor/1.0)',
            ],
        ]);
    }

    public function getProviderInfo(): array
    {
        return [
            'name' => 'AI Information Extractor',
            'description' => 'Extract part info from any URL using an LLM',
            // Must describe what isActive() actually checks: a selected platform and model.
            'disabled_help' => 'Select an AI platform and a model in the settings',
            'settings_class' => AIExtractorSettings::class,
        ];
    }

    public function getProviderKey(): string
    {
        return 'ai_extractor';
    }

    /**
     * The provider is usable as soon as a platform and a non-empty model are configured.
     */
    public function isActive(): bool
    {
        return $this->settings->platform !== null
            && $this->settings->model !== null
            && $this->settings->model !== '';
    }

    /**
     * Treats the keyword as the URL of a product page and returns the part extracted from it.
     *
     * @return PartDetailDTO[] a single-element list, or an empty list if a
     *                         ProviderIDNotSupportedException is raised for the keyword
     */
    public function searchByKeyword(string $keyword): array
    {
        try {
            return [$this->getDetails($keyword)];
        } catch (ProviderIDNotSupportedException) {
            return [];
        }
    }

    /**
     * Fetches the page behind the given URL and lets the LLM extract the part details from it.
     *
     * @param string $id the URL of the page to extract the information from
     *
     * @throws \RuntimeException if the LLM is not configured, fails or returns unusable data
     */
    public function getDetails(string $id): PartDetailDTO
    {
        $url = $this->fixAndValidateURL($id);

        // Fetch the HTML content of the page.
        $html = $this->httpClient->request('GET', $url)->getContent();

        // The page is passed to the LLM as markdown. cleanHTML()/truncateHTML() belong to an
        // alternative raw-HTML pipeline which is currently not used here.
        $markdown = $this->htmlToMarkdown($html);

        // Extract structured data using traditional methods, to provide additional context to
        // the LLM. This can help improve accuracy, especially for technical specifications that
        // might be in tables or specific formats.
        $structuredData = $this->extractStructuredData($html, $url);

        $llmResponse = $this->callLLM($markdown, $url, $structuredData);

        return $this->jsonSchemaConverter->jsonToDTO(
            $llmResponse,
            $this->getProviderKey(),
            $url,
            $url,
            self::DISTRIBUTOR_NAME,
        );
    }

    /**
     * Extracts structured data from the HTML using the Micrometa parser.
     *
     * @return string JSON encoded structured data
     *
     * @throws \JsonException if the extracted items cannot be encoded as JSON
     */
    private function extractStructuredData(string $html, string $url): string
    {
        $micrometa = new Micrometa\Ports\Parser();
        $items = $micrometa($url, $html);

        // Slashes and unicode are left unescaped so URLs and non-ASCII text stay readable
        // (and shorter) in the context that is sent to the LLM.
        return json_encode(
            $items->toObject(),
            JSON_THROW_ON_ERROR | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE,
        );
    }

    /**
     * Converts the main content area of the page (or the whole document if none is found)
     * to markdown.
     */
    private function htmlToMarkdown(string $html): string
    {
        // Extract only the main content of the page to avoid overwhelming the LLM with
        // irrelevant information.
        $crawler = new Crawler($html);
        $mainContent = $crawler->filter('main, article, #content')->first();

        // If we found a specific content area, use its HTML; otherwise, use the whole document.
        $htmlToConvert = $mainContent->count() > 0 ? $mainContent->html() : $html;

        // Convert to markdown
        $converter = new HtmlConverter([
            'strip_tags' => true, // Removes tags that have no markdown equivalent (like <div> or <span>)
            'hard_break' => true, // Render <br> as a plain newline instead of two trailing spaces
            'remove_nodes' => 'nav footer script style', // Extra safety layer
        ]);

        return $converter->convert($htmlToConvert);
    }

    public function getCapabilities(): array
    {
        return [
            ProviderCapabilities::BASIC,
            ProviderCapabilities::PICTURE,
            ProviderCapabilities::DATASHEET,
            ProviderCapabilities::PRICE,
            ProviderCapabilities::PARAMETERS,
        ];
    }

    /**
     * Removes script, style, nav, footer and header elements as well as HTML comments.
     *
     * @throws \RuntimeException if the regex engine fails (e.g. backtrack limit on huge input)
     */
    private function cleanHTML(string $html): string
    {
        $patterns = [
            '/<script\b[^>]*>.*?<\/script>/is',
            '/<style\b[^>]*>.*?<\/style>/is',
            '/<nav\b[^>]*>.*?<\/nav>/is',
            '/<footer\b[^>]*>.*?<\/footer>/is',
            '/<header\b[^>]*>.*?<\/header>/is',
            '/<!--.*?-->/s',
        ];

        // With an array of patterns preg_replace() applies them one after another.
        $cleaned = preg_replace($patterns, '', $html);
        if ($cleaned === null) {
            throw new \RuntimeException('Failed to clean HTML: ' . preg_last_error_msg());
        }

        return $cleaned;
    }

    /**
     * Truncates the HTML to at most $maxLength bytes.
     *
     * If a '>' or a space is found within the last 10% of the allowed length, the cut is moved
     * back to just after that character, so a tag or word is less likely to be cut in half.
     * Lengths are byte based (strlen/substr), not character based.
     */
    private function truncateHTML(string $html, int $maxLength): string
    {
        if ($maxLength <= 0) {
            // A negative length would make substr() count from the end of the string.
            return '';
        }

        if (strlen($html) <= $maxLength) {
            return $html;
        }

        $truncated = substr($html, 0, $maxLength);

        // strrpos() returns false when nothing is found; map that to -1 so the two positions
        // can be compared as plain integers.
        $lastTagEnd = strrpos($truncated, '>');
        $lastSpace = strrpos($truncated, ' ');
        $lastPos = max(
            $lastTagEnd === false ? -1 : $lastTagEnd,
            $lastSpace === false ? -1 : $lastSpace,
        );

        if ($lastPos > $maxLength * 0.9) {
            return substr($truncated, 0, $lastPos + 1);
        }

        return $truncated;
    }

    /**
     * Sends the page content (and optionally the pre-extracted structured data) to the
     * configured model and returns the decoded JSON answer.
     *
     * @param string      $htmlContent    the page content, as passed in by getDetails() (markdown)
     * @param string      $url            the URL the content was fetched from
     * @param string|null $structuredData JSON with data extracted by traditional methods, if any
     *
     * @return array the decoded JSON structure returned by the model
     *
     * @throws \RuntimeException if no platform/model is configured, the invocation fails or the
     *                           answer is not a JSON object/array
     */
    private function callLLM(string $htmlContent, string $url, ?string $structuredData = null): array
    {
        // Configuration problems are reported as such, not disguised as invocation failures.
        $platform = $this->settings->platform ?? throw new \RuntimeException('No AI platform selected');
        $model = $this->settings->model ?? throw new \RuntimeException('No model selected');
        if ($model === '') {
            throw new \RuntimeException('No model selected');
        }

        $input = new MessageBag(
            Message::forSystem($this->buildSystemPrompt()),
            Message::ofUser("Extract part information from this webpage content:\n\nURL: $url\n\n$htmlContent"),
        );

        if ($structuredData !== null && $structuredData !== '') {
            $input->add(Message::ofUser("Following data was extracted using traditional methods, but might be incomplete or inaccurate. Enrich it with the actual website data:\n\n" . $structuredData));
        }

        try {
            // Reading the result is kept inside the try block, so failures that only surface
            // when the result is resolved are wrapped as well.
            $content = $this->AIPlatformRegistry
                ->getPlatform($platform)
                ->invoke($model, $input, [
                    'response_format' => [
                        'type' => 'json_schema',
                        'json_schema' => $this->jsonSchemaConverter->getJSONSchema(),
                    ],
                ])
                ->getResult()
                ->getContent();
        } catch (\Throwable $e) {
            throw new \RuntimeException('LLM invocation failed: ' . $e->getMessage(), previous: $e);
        }

        return $this->decodeLLMContent($content);
    }

    /**
     * Normalizes the content of an LLM result to an array.
     *
     * Arrays are returned as they are, strings are decoded as JSON.
     *
     * @throws \RuntimeException if the content is neither an array nor a JSON encoded array/object
     */
    private function decodeLLMContent(mixed $content): array
    {
        if (is_array($content)) {
            return $content;
        }

        if (!is_string($content)) {
            throw new \RuntimeException(
                sprintf('Unexpected LLM result of type %s', get_debug_type($content))
            );
        }

        try {
            $decoded = json_decode($content, true, 512, JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            throw new \RuntimeException('LLM returned invalid JSON: ' . $e->getMessage(), previous: $e);
        }

        if (!is_array($decoded)) {
            throw new \RuntimeException('LLM returned JSON that is not an object or array');
        }

        return $decoded;
    }

    /**
     * Returns the system prompt that instructs the model how to extract the part data.
     */
    private function buildSystemPrompt(): string
    {
        return <<<'PROMPT'
        You are an expert at extracting electronic component information from web pages.
        Extract structured data in JSON format, from markdown extracted from a product page.
        Focus on the main content of the page, such as product descriptions, specifications, and tables. Ignore navigation menus, footers, and sidebars.

        Rules:
        - manufacturing_status: Use "active", "obsolete", "nrfnd" (not recommended for new designs), "discontinued", or null
        - parameters: Extract technical specs like voltage, current, temperature, etc.
        - prices: Extract pricing tiers with minimum_quantity, price, and currency code
        - URLs must be absolute (include https://...)
        - If information is not found, use null
        - Return ONLY the JSON, no explanation text

        For parameters, combine name, value, and unit. The unit should be separate if possible.
        PROMPT;
    }
}