Part-DB-server/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php

256 lines
9.2 KiB
PHP
Raw Normal View History

2026-03-22 11:59:36 +05:30
<?php
/*
* This file is part of Part-DB (https://github.com/Part-DB/Part-DB-symfony).
*
* Copyright (C) 2019 - 2026 Jan Böhmer (https://github.com/jbtronics)
* Copyright (C) 2026 Rahul Singh (https://github.com/rahools)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
declare(strict_types=1);
namespace App\Services\InfoProviderSystem\Providers;
use App\Exceptions\ProviderIDNotSupportedException;
use App\Services\AI\AIPlatformRegistry;
use App\Services\InfoProviderSystem\DTOJsonSchemaConverter;
2026-03-22 11:59:36 +05:30
use App\Services\InfoProviderSystem\DTOs\PartDetailDTO;
use App\Settings\InfoProviderSystem\AIExtractorSettings;
use Brick\Schema\SchemaReader;
use Jkphl\Micrometa;
use League\HTMLToMarkdown\HtmlConverter;
use Symfony\AI\Platform\Message\Message;
use Symfony\AI\Platform\Message\MessageBag;
use Symfony\Component\DomCrawler\Crawler;
2026-03-22 11:59:36 +05:30
use Symfony\Contracts\HttpClient\HttpClientInterface;
final class AIInfoExtractor implements InfoProviderInterface
2026-03-22 11:59:36 +05:30
{
use FixAndValidateUrlTrait;
2026-03-22 11:59:36 +05:30
private const DISTRIBUTOR_NAME = 'AI Extracted';
private readonly HttpClientInterface $httpClient;
/**
 * Stores the collaborators and derives a dedicated HTTP client for fetching product pages.
 *
 * The injected client is not kept as-is: a copy configured through withOptions() with a
 * `timeout` of 30 and a Part-DB specific User-Agent header is stored instead.
 *
 * @param HttpClientInterface    $httpClient          Base HTTP client the page-fetching client is derived from
 * @param AIExtractorSettings    $settings            Provider settings (platform and model are read from it)
 * @param AIPlatformRegistry     $AIPlatformRegistry  Registry used to resolve the selected AI platform
 * @param DTOJsonSchemaConverter $jsonSchemaConverter Supplies the JSON schema and maps the LLM answer to a DTO
 */
public function __construct(
    HttpClientInterface $httpClient,
    private readonly AIExtractorSettings $settings,
    private readonly AIPlatformRegistry $AIPlatformRegistry,
    private readonly DTOJsonSchemaConverter $jsonSchemaConverter,
) {
    $requestDefaults = [
        'timeout' => 30,
        'headers' => [
            'User-Agent' => 'Mozilla/5.0 (compatible; Part-DB AI-Extractor/1.0)',
        ],
    ];

    $this->httpClient = $httpClient->withOptions($requestDefaults);
}
/**
 * Returns the static metadata describing this provider.
 *
 * @return array<string, string> Entries: name, description, disabled_help and settings_class
 */
public function getProviderInfo(): array
{
    $info = [];

    $info['name'] = 'AI Information Extractor';
    $info['description'] = 'Extract part info from any URL using OpenRouter LLM';
    // No 'url' entry is exposed at the moment:
    // $info['url'] = 'https://openrouter.ai';
    $info['disabled_help'] = 'Configure OpenRouter API key in settings';
    $info['settings_class'] = AIExtractorSettings::class;

    return $info;
}
/**
 * Returns the key identifying this provider.
 *
 * getDetails() passes the same key on to the DTO converter.
 *
 * @return string Always 'ai_extractor'
 */
public function getProviderKey(): string
{
    return 'ai_extractor';
}
/**
 * Tells whether the provider is configured well enough to be used.
 *
 * @return bool True only when a platform is selected and the model is a non-empty string
 */
public function isActive(): bool
{
    if ($this->settings->platform === null) {
        return false;
    }

    $model = $this->settings->model;

    return $model !== null && $model !== '';
}
/**
 * Treats the keyword as the ID handed to getDetails() and returns at most one result.
 *
 * @param string $keyword Passed unchanged to getDetails()
 * @return array A single-element list holding the extracted part, or an empty list when
 *               a ProviderIDNotSupportedException is raised while resolving the keyword
 */
public function searchByKeyword(string $keyword): array
{
    try {
        $details = $this->getDetails($keyword);
    } catch (ProviderIDNotSupportedException) {
        // The keyword cannot be handled by this provider: report "no results".
        return [];
    }

    return [$details];
}
/**
 * Downloads the page behind the given ID and lets the configured LLM turn it into a part DTO.
 *
 * @param string $id Page address; it is run through fixAndValidateURL() before use
 * @return PartDetailDTO The DTO built by the JSON schema converter from the LLM answer
 */
public function getDetails(string $id): PartDetailDTO
{
    $url = $this->fixAndValidateURL($id);

    // Download the raw markup of the page.
    $html = $this->httpClient->request('GET', $url)->getContent();

    // The LLM is fed a Markdown rendition of the page instead of the raw HTML.
    $markdown = $this->htmlToMarkdown($html);

    // Structured data embedded in the page is extracted with a conventional parser and handed
    // to the LLM as additional context. This can help accuracy, especially for technical
    // specifications that might live in tables or specific formats.
    $structuredData = $this->extractStructuredData($html, $url);

    // Ask the LLM for the part information.
    $llmResponse = $this->callLLM($markdown, $url, $structuredData);

    // Map the answer onto a PartDetailDTO.
    return $this->jsonSchemaConverter->jsonToDTO(
        $llmResponse,
        $this->getProviderKey(),
        $url,
        $url,
        self::DISTRIBUTOR_NAME,
    );
}
/**
 * Runs the Micrometa parser over the page and serialises whatever it finds.
 *
 * @param string $html Markup of the page
 * @param string $url  Address of the page, handed to the parser together with the markup
 * @return string JSON encoded structured data
 * @throws \JsonException When the parsed items cannot be encoded
 */
private function extractStructuredData(string $html, string $url): string
{
    $parse = new Micrometa\Ports\Parser();
    $structuredItems = $parse($url, $html)->toObject();

    return json_encode($structuredItems, JSON_THROW_ON_ERROR);
}
/**
 * Converts the relevant part of an HTML page to Markdown.
 *
 * @param string $html Markup of the complete page
 * @return string Markdown produced by the HTML-to-Markdown converter
 */
private function htmlToMarkdown(string $html): string
{
    // Only the main content of the page is of interest; everything else would just
    // overwhelm the LLM with irrelevant information.
    $mainContent = (new Crawler($html))->filter('main, article, #content')->first();

    if ($mainContent->count() > 0) {
        // A dedicated content area exists: use its inner HTML.
        $htmlToConvert = $mainContent->html();
    } else {
        // No such area was found: fall back to the whole document.
        $htmlToConvert = $html;
    }

    // Convert to markdown
    $converterOptions = [
        'strip_tags' => true, // Removes tags that aren't Markdown-compatible (like <div>)
        'hard_break' => true, // Preserves line breaks
        'remove_nodes' => 'nav footer script style', // Extra safety layer
    ];

    return (new HtmlConverter($converterOptions))->convert($htmlToConvert);
}
/**
 * Lists the kinds of information this provider announces.
 *
 * @return array The BASIC, PICTURE, DATASHEET, PRICE and PARAMETERS capabilities, in that order
 */
public function getCapabilities(): array
{
    $capabilities = [
        ProviderCapabilities::BASIC,
        ProviderCapabilities::PICTURE,
        ProviderCapabilities::DATASHEET,
        ProviderCapabilities::PRICE,
        ProviderCapabilities::PARAMETERS,
    ];

    return $capabilities;
}
/**
 * Strips page furniture (scripts, styles, navigation, footer, header, comments) from HTML.
 *
 * Each pattern is applied in turn. Should one of them fail inside PCRE, the markup produced
 * so far is kept instead of propagating null.
 *
 * @param string $html Raw markup
 * @return string Markup without the matched elements
 */
private function cleanHTML(string $html): string
{
    $patterns = [
        '/<script\b[^>]*>(.*?)<\/script>/is', // script tags
        '/<style\b[^>]*>(.*?)<\/style>/is',   // style tags
        '/<nav\b[^>]*>(.*?)<\/nav>/is',       // nav tags
        '/<footer\b[^>]*>(.*?)<\/footer>/is', // footer tags
        '/<header\b[^>]*>(.*?)<\/header>/is', // header tags
        '/<!--(.*?)-->/is',                   // HTML comments
    ];

    foreach ($patterns as $pattern) {
        // preg_replace() returns null on a PCRE error (e.g. the backtrack limit being hit on a
        // very large page). Under strict_types that null would raise a TypeError on the next
        // call or on return, so fall back to the unmodified markup for this step.
        $html = preg_replace($pattern, '', $html) ?? $html;
    }

    return $html;
}
/**
 * Shortens markup to at most $maxLength bytes.
 *
 * When a '>' or a space lies within the last 10% of the allowed length, the cut is moved back
 * to just after it so that no tag is split. Otherwise the hard cut is kept, minus any
 * incomplete multi-byte UTF-8 sequence left dangling at the end.
 *
 * @param string $html      Markup to shorten
 * @param int    $maxLength Maximum length in bytes; values <= 0 yield an empty string
 * @return string The input itself when it already fits, the shortened markup otherwise
 */
private function truncateHTML(string $html, int $maxLength): string
{
    // A negative length would make substr() cut from the END of the string.
    if ($maxLength <= 0) {
        return '';
    }

    if (strlen($html) <= $maxLength) {
        return $html;
    }

    $truncated = substr($html, 0, $maxLength);

    // Position of the last '>' or space, ignoring "not found" (false) results explicitly
    // instead of relying on how max() compares bool with int.
    $boundaries = array_filter(
        [strrpos($truncated, '>'), strrpos($truncated, ' ')],
        static fn (int|false $position): bool => $position !== false,
    );

    if ($boundaries !== []) {
        $lastPos = max($boundaries);
        if ($lastPos > $maxLength * 0.9) {
            // Ends on an ASCII character, so no multi-byte sequence can be split here.
            return substr($truncated, 0, $lastPos + 1);
        }
    }

    // Hard cut: walk back over trailing continuation bytes (10xxxxxx) to the last lead byte
    // and drop the sequence if the cut left it incomplete.
    $length = strlen($truncated);
    for ($back = 1; $back <= 4 && $back <= $length; $back++) {
        $byte = ord($truncated[$length - $back]);
        if (($byte & 0xC0) === 0x80) {
            continue;
        }

        $expectedBytes = match (true) {
            $byte >= 0xF0 => 4,
            $byte >= 0xE0 => 3,
            $byte >= 0xC0 => 2,
            default => 1,
        };
        if ($expectedBytes > $back) {
            $truncated = substr($truncated, 0, $length - $back);
        }
        break;
    }

    return $truncated;
}
/**
 * Sends the page content, plus optional pre-extracted structured data, to the configured AI
 * platform and returns the content of its answer.
 *
 * The request asks for a `json_schema` response format using the schema supplied by the
 * DTO JSON schema converter.
 *
 * @param string      $htmlContent    Page content embedded in the user message (getDetails() passes Markdown)
 * @param string      $url            Address of the page, mentioned in the user message
 * @param string|null $structuredData Extra context added as a second user message; skipped when empty
 * @return array Content of the platform result
 * @throws \RuntimeException When no platform or model is selected, or the invocation fails
 *                           (the original error is attached as "previous")
 */
private function callLLM(string $htmlContent, string $url, ?string $structuredData = null): array
{
    $messages = new MessageBag(
        Message::forSystem($this->buildSystemPrompt()),
        Message::ofUser("Extract part information from this webpage content:\n\nURL: $url\n\n$htmlContent")
    );

    if ($structuredData) {
        $hint = "Following data was extracted using traditional methods, but might be incomplete or inaccurate.\n"
            ."Enrich it with the actual website data:\n\n";
        $messages->add(Message::ofUser($hint.$structuredData));
    }

    try {
        // Configuration problems are raised inside the try block on purpose, so they are
        // reported through the same wrapped exception as platform failures.
        $platformName = $this->settings->platform ?? throw new \RuntimeException('No AI platform selected');
        $aiPlatform = $this->AIPlatformRegistry->getPlatform($platformName);

        // e.g. 'openai/gpt-5-mini'
        $modelName = $this->settings->model ?? throw new \RuntimeException('No model selected');
        $invocationOptions = [
            'response_format' => [
                'type' => 'json_schema',
                'json_schema' => $this->jsonSchemaConverter->getJSONSchema(),
            ],
        ];

        $result = $aiPlatform->invoke($modelName, $messages, $invocationOptions);
    } catch (\Throwable $failure) {
        throw new \RuntimeException('LLM invocation failed: '.$failure->getMessage(), previous: $failure);
    }

    return $result->getResult()->getContent();
}
/**
 * Returns the fixed system prompt that instructs the model how to extract part data.
 *
 * The text is a nowdoc, so nothing in it is interpolated. A stray revision timestamp that had
 * ended up between the introduction and the rules was removed so it is not sent to the model.
 *
 * @return string The system prompt
 */
private function buildSystemPrompt(): string
{
    return <<<'PROMPT'
        You are an expert at extracting electronic component information from web pages. Extract structured data in JSON format, from markdown extracted from a product page.
        Focus on the main content of the page, such as product descriptions, specifications, and tables. Ignore navigation menus, footers, and sidebars.
        Rules:
        - manufacturing_status: Use "active", "obsolete", "nrfnd" (not recommended for new designs), "discontinued", or null
        - parameters: Extract technical specs like voltage, current, temperature, etc.
        - prices: Extract pricing tiers with minimum_quantity, price, and currency code
        - URLs must be absolute (include https://...)
        - If information is not found, use null
        - Return ONLY the JSON, no explanation text
        For parameters, combine name, value, and unit. The unit should be separate if possible.
        PROMPT;
}
}