2026-03-22 11:59:36 +05:30
< ? php
/*
 * This file is part of Part-DB (https://github.com/Part-DB/Part-DB-symfony).
 *
 *  Copyright (C) 2019 - 2026 Jan Böhmer (https://github.com/jbtronics)
 *  Copyright (C) 2026 Rahul Singh (https://github.com/rahools)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */
declare ( strict_types = 1 );
namespace App\Services\InfoProviderSystem\Providers ;
2026-04-25 22:21:06 +02:00
use App\Exceptions\ProviderIDNotSupportedException ;
2026-04-25 23:29:22 +02:00
use App\Services\AI\AIPlatformRegistry ;
2026-04-25 22:21:06 +02:00
use App\Services\InfoProviderSystem\DTOJsonSchemaConverter ;
2026-03-22 11:59:36 +05:30
use App\Services\InfoProviderSystem\DTOs\PartDetailDTO ;
use App\Settings\InfoProviderSystem\AIExtractorSettings ;
2026-04-26 19:36:03 +02:00
use Brick\Schema\SchemaReader ;
use Jkphl\Micrometa ;
use League\HTMLToMarkdown\HtmlConverter ;
2026-04-23 23:26:23 +02:00
use Symfony\AI\Platform\Message\Message ;
use Symfony\AI\Platform\Message\MessageBag ;
2026-04-26 19:36:03 +02:00
use Symfony\Component\DomCrawler\Crawler ;
2026-03-22 11:59:36 +05:30
use Symfony\Contracts\HttpClient\HttpClientInterface ;
2026-04-26 19:36:03 +02:00
2026-04-25 22:21:06 +02:00
final class AIInfoExtractor implements InfoProviderInterface
2026-03-22 11:59:36 +05:30
{
2026-04-25 22:21:06 +02:00
use FixAndValidateUrlTrait ;
2026-03-22 11:59:36 +05:30
private const DISTRIBUTOR_NAME = 'AI Extracted' ;
private readonly HttpClientInterface $httpClient ;
2026-04-25 22:21:06 +02:00
/**
 * Wires up the provider's collaborators.
 *
 * The injected HTTP client is not stored as-is: a client derived from it via
 * withOptions() (30 second timeout, custom User-Agent header) is kept instead.
 *
 * @param HttpClientInterface    $httpClient          Base client the configured client is derived from.
 * @param AIExtractorSettings    $settings            Settings providing the selected platform and model.
 * @param AIPlatformRegistry     $AIPlatformRegistry  Registry used to look up the AI platform to invoke.
 * @param DTOJsonSchemaConverter $jsonSchemaConverter Supplies the JSON schema and turns the LLM output into a DTO.
 */
public function __construct(
    HttpClientInterface $httpClient,
    private readonly AIExtractorSettings $settings,
    private readonly AIPlatformRegistry $AIPlatformRegistry,
    private readonly DTOJsonSchemaConverter $jsonSchemaConverter,
) {
    $clientOptions = [
        'timeout' => 30,
        'headers' => [
            'User-Agent' => 'Mozilla/5.0 (compatible; Part-DB AI-Extractor/1.0)',
        ],
    ];

    $this->httpClient = $httpClient->withOptions($clientOptions);
}
/**
 * Returns the static metadata describing this provider.
 *
 * The texts refer to the configurable AI platform and model, because isActive()
 * depends on exactly these two settings.
 *
 * @return array{name: string, description: string, disabled_help: string, settings_class: class-string}
 */
public function getProviderInfo(): array
{
    return [
        'name' => 'AI Information Extractor',
        'description' => 'Extract part info from any URL using an LLM of the configured AI platform',
        // NOTE(review): no 'url' entry is provided; the former OpenRouter link was disabled,
        // presumably because the backend is now selected via the settings - confirm.
        'disabled_help' => 'Select an AI platform and a model in the AI extractor settings',
        'settings_class' => AIExtractorSettings::class,
    ];
}
/**
 * Returns the key identifying this provider.
 *
 * getDetails() passes the same key on to the DTO converter.
 */
public function getProviderKey(): string
{
    return 'ai_extractor';
}
/**
 * Tells whether the provider is usable.
 *
 * @return bool true only when a platform is selected and a non-empty model name is set
 */
public function isActive(): bool
{
    // Without a platform there is nothing to invoke.
    if ($this->settings->platform === null) {
        return false;
    }

    $model = $this->settings->model;

    return $model !== null && $model !== '';
}
/**
 * Treats the keyword as a provider ID and returns the details for it as the only search result.
 *
 * @param string $keyword Value handed unchanged to getDetails().
 *
 * @return PartDetailDTO[] One-element list, or an empty list when getDetails() rejects the
 *                         keyword with a ProviderIDNotSupportedException.
 */
public function searchByKeyword(string $keyword): array
{
    try {
        $details = $this->getDetails($keyword);
    } catch (ProviderIDNotSupportedException) {
        // An unsupported ID simply means "no results"; all other errors propagate.
        return [];
    }

    return [$details];
}
/**
 * Downloads the page behind the given ID and lets the LLM extract the part details from it.
 *
 * Steps: validate the URL, fetch the HTML, convert it to Markdown, extract structured
 * data from the raw HTML as extra context, call the LLM and convert its answer to a DTO.
 * cleanHTML() and truncateHTML() are not part of this pipeline.
 *
 * Exceptions raised by the URL validation, the HTTP client, the parsers or callLLM()
 * are not caught here.
 *
 * @param string $id The page URL (validated/normalised via fixAndValidateURL()).
 *
 * @return PartDetailDTO DTO built by the JSON schema converter from the LLM response.
 */
public function getDetails(string $id): PartDetailDTO
{
    $url = $this->fixAndValidateURL($id);

    // Fetch the raw page content.
    $html = $this->httpClient->request('GET', $url)->getContent();

    // The LLM receives a Markdown rendition of the page ...
    $markdown = $this->htmlToMarkdown($html);

    // ... plus the structured data extracted with traditional methods, as additional context.
    // This can help improve accuracy, especially for technical specifications that might be
    // in tables or specific formats.
    $structuredData = $this->extractStructuredData($html, $url);

    $llmResponse = $this->callLLM($markdown, $url, $structuredData);

    // The URL is passed twice, exactly as before, followed by the fixed distributor name.
    return $this->jsonSchemaConverter->jsonToDTO(
        $llmResponse,
        $this->getProviderKey(),
        $url,
        $url,
        self::DISTRIBUTOR_NAME,
    );
}
/**
 * Runs the Micrometa parser over the page and serialises the items it finds.
 *
 * @param string $html Raw HTML of the page.
 * @param string $url  URL the HTML was loaded from.
 *
 * @return string JSON encoded structured data
 *
 * @throws \JsonException When the parsed items cannot be encoded as JSON.
 */
private function extractStructuredData(string $html, string $url): string
{
    $parser = new Micrometa\Ports\Parser();

    return json_encode($parser($url, $html)->toObject(), JSON_THROW_ON_ERROR);
}
/**
 * Converts the relevant part of a page to Markdown.
 *
 * Only the first element matching "main, article, #content" is converted, to avoid
 * overwhelming the LLM with irrelevant information; when no such element exists the
 * complete HTML is converted.
 *
 * @param string $html Raw HTML of the page.
 *
 * @return string Markdown produced by the HTML-to-Markdown converter.
 */
private function htmlToMarkdown(string $html): string
{
    $primary = (new Crawler($html))
        ->filter('main, article, #content')
        ->first();

    // Prefer the inner HTML of the dedicated content area, fall back to the full document.
    if ($primary->count()) {
        $source = $primary->html();
    } else {
        $source = $html;
    }

    // Convert to markdown
    $markdownConverter = new HtmlConverter([
        'strip_tags' => true, // Removes tags that aren't Markdown-compatible (like <div>)
        'hard_break' => true, // Preserves line breaks
        'remove_nodes' => 'nav footer script style' // Extra safety layer
    ]);

    return $markdownConverter->convert($source);
}
/**
 * Lists the kinds of information this provider declares it can deliver.
 *
 * @return ProviderCapabilities[] Basic data, picture, datasheet, price and parameters.
 */
public function getCapabilities(): array
{
    $capabilities = [
        ProviderCapabilities::BASIC,
        ProviderCapabilities::PICTURE,
        ProviderCapabilities::DATASHEET,
        ProviderCapabilities::PRICE,
        ProviderCapabilities::PARAMETERS,
    ];

    return $capabilities;
}
/**
 * Strips markup that carries no part information from the HTML.
 *
 * Removed, in this order: <script>, <style>, <nav>, <footer> and <header> elements
 * (including their content) and HTML comments.
 *
 * preg_replace() returns null when PCRE fails (for example when the backtrack limit
 * is hit on a very large page). In that case the affected removal step is skipped and
 * the text cleaned so far is kept, instead of passing null on to the next call.
 *
 * @param string $html Raw HTML.
 *
 * @return string The HTML without the elements listed above.
 */
private function cleanHTML(string $html): string
{
    $patterns = [
        '/<script\b[^>]*>.*?<\/script>/is',
        '/<style\b[^>]*>.*?<\/style>/is',
        '/<nav\b[^>]*>.*?<\/nav>/is',
        '/<footer\b[^>]*>.*?<\/footer>/is',
        '/<header\b[^>]*>.*?<\/header>/is',
        '/<!--.*?-->/is',
    ];

    foreach ($patterns as $pattern) {
        // Best effort: a failed step must not discard the result of the previous ones.
        $html = preg_replace($pattern, '', $html) ?? $html;
    }

    return $html;
}
/**
 * Shortens the HTML to at most $maxLength bytes.
 *
 * The cut never splits a multi-byte UTF-8 character: when the byte limit falls inside
 * a character, the cut is moved back to the start of that character. If a '>' or a
 * space is found within the last 10% of the limit, the text is additionally cut right
 * after it to avoid ending inside a tag or word.
 *
 * @param string $html      Text to shorten.
 * @param int    $maxLength Maximum length in bytes; values <= 0 yield an empty string.
 *
 * @return string The unchanged input when it already fits, otherwise the shortened text.
 */
private function truncateHTML(string $html, int $maxLength): string
{
    if ($maxLength <= 0) {
        return '';
    }

    if (strlen($html) <= $maxLength) {
        return $html;
    }

    // $html[$cut] is the first byte that gets dropped. If it is a UTF-8 continuation
    // byte (10xxxxxx) the limit lies inside a character, so step back to its lead byte.
    // A character has at most 3 continuation bytes, which bounds the back-off.
    $cut = $maxLength;
    for ($step = 0; $step < 3 && $cut > 0 && (ord($html[$cut]) & 0xC0) === 0x80; ++$step) {
        --$cut;
    }
    $truncated = substr($html, 0, $cut);

    // Find the last occurrence of > or space; "not found" is mapped to -1 so that
    // max() only ever compares integers.
    $lastTagEnd = strrpos($truncated, '>');
    $lastSpace = strrpos($truncated, ' ');
    $lastPos = max(
        $lastTagEnd === false ? -1 : $lastTagEnd,
        $lastSpace === false ? -1 : $lastSpace,
    );

    if ($lastPos > $maxLength * 0.9) {
        $truncated = substr($truncated, 0, $lastPos + 1);
    }

    return $truncated;
}
2026-04-26 19:36:03 +02:00
/**
 * Sends the page content (and optionally the pre-extracted structured data) to the
 * configured AI platform and returns the structured answer.
 *
 * The request asks for a JSON-schema constrained response, using the schema supplied
 * by the DTO JSON schema converter.
 *
 * @param string      $htmlContent    Page content placed into the user message.
 * @param string      $url            URL of the page, included in the user message.
 * @param string|null $structuredData Optional JSON string added as a second user message.
 *
 * @return array The decoded response content.
 *
 * @throws \RuntimeException When no platform/model is selected, the invocation or the
 *                           retrieval of its result fails, or the response is not a JSON object/array.
 */
private function callLLM(string $htmlContent, string $url, ?string $structuredData = null): array
{
    $input = new MessageBag(
        Message::forSystem($this->buildSystemPrompt()),
        Message::ofUser(" Extract part information from this webpage content: \n \n URL: $url\n\n $htmlContent ")
    );

    if ($structuredData) {
        $input->add(Message::ofUser(
            " Following data was extracted using traditional methods, but might be incomplete or inaccurate.\nEnrich it with the actual website data : \n\n " . $structuredData
        ));
    }

    try {
        $aiPlatform = $this->AIPlatformRegistry->getPlatform(
            $this->settings->platform ?? throw new \RuntimeException('No AI platform selected')
        );

        $result = $aiPlatform->invoke(
            $this->settings->model ?? throw new \RuntimeException('No model selected'),
            $input,
            [
                'response_format' => [
                    'type' => 'json_schema',
                    'json_schema' => $this->jsonSchemaConverter->getJSONSchema(),
                ],
            ]
        );

        // Resolve the result inside the try block as well: the platform may only perform or
        // evaluate the actual request at this point, and such failures must be wrapped too.
        $content = $result->getResult()->getContent();
    } catch (\Throwable $e) {
        throw new \RuntimeException('LLM invocation failed: ' . $e->getMessage(), previous: $e);
    }

    // Some result types may hand the JSON back as text; decode it instead of failing on
    // the array return type.
    if (is_string($content)) {
        try {
            $content = json_decode($content, true, flags: JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            throw new \RuntimeException('LLM returned invalid JSON: ' . $e->getMessage(), previous: $e);
        }
    }

    if (!is_array($content)) {
        throw new \RuntimeException('LLM returned an unexpected response type: ' . get_debug_type($content));
    }

    return $content;
}
/**
 * Returns the fixed system prompt that instructs the model how to extract the part data.
 *
 * The nowdoc is indented together with its closing marker, so the indentation is not
 * part of the returned text.
 */
private function buildSystemPrompt(): string
{
    $prompt = <<<'PROMPT'
        You are an expert at extracting electronic component information from web pages . Extract structured data in JSON format , from markdown extracted from a product page .
        Focus on the main content of the page , such as product descriptions , specifications , and tables . Ignore navigation menus , footers , and sidebars .
        Rules :
        - manufacturing_status : Use " active " , " obsolete " , " nrfnd " ( not recommended for new designs ), " discontinued " , or null
        - parameters : Extract technical specs like voltage , current , temperature , etc .
        - prices : Extract pricing tiers with minimum_quantity , price , and currency code
        - URLs must be absolute ( include https ://... )
        - If information is not found , use null
        - Return ONLY the JSON , no explanation text
        For parameters , combine name , value , and unit . The unit should be separate if possible .
        PROMPT;

    return $prompt;
}
}