Use markdown as input for the LLM and add extracted microdata separately

This commit is contained in:
Jan Böhmer 2026-04-26 19:36:03 +02:00
parent af98fc1079
commit 4dbd92ac4d
5 changed files with 481 additions and 97 deletions

View file

@ -54,7 +54,8 @@ final class DTOJsonSchemaConverter
'category' => ['type' => ['string', 'null'], 'description' => 'Product category'],
'manufacturing_status' => ['type' => ['string', 'null'], 'enum' => ['active', 'obsolete', 'nrfnd', 'discontinued', null], 'description' => 'Manufacturing status'],
'footprint' => ['type' => ['string', 'null'], 'description' => 'Package/footprint type'],
'mass' => ['type' => ['number', 'null'], 'description' => 'Mass in grams'],
'mass' => ['type' => ['number', 'null'], 'description' => 'Mass of the product in grams'],
'gtin' => ['type' => ['string', 'null'], 'description' => 'Global Trade Item Number (GTIN) / EAN / UPC code'],
'parameters' => [
'type' => 'array',
'items' => [
@ -94,17 +95,17 @@ final class DTOJsonSchemaConverter
'items' => [
'type' => 'object',
'properties' => [
'distributor_name' => ['type' => 'string'],
'order_number' => ['type' => ['string', 'null']],
'distributor_name' => ['type' => 'string', 'description' => 'Name of the distributor or vendor. Typically the shop name'],
'order_number' => ['type' => ['string', 'null'], 'description' => 'The order number or SKU used by the distributor. Optional, but can help to find the product on the distributor website.'],
'product_url' => ['type' => 'string'],
'prices' => [
'type' => 'array',
'items' => [
'type' => 'object',
'properties' => [
'minimum_quantity' => ['type' => 'integer'],
'price' => ['type' => 'number'],
'currency' => ['type' => 'string'],
'minimum_quantity' => ['type' => 'integer', 'description' => 'Minimum quantity for this price tier. 1 when no tiered pricing is available.'],
'price' => ['type' => 'number', 'description' => 'Price for the given minimum quantity.'],
'currency' => ['type' => 'string', 'description' => 'Currency ISO code, e.g. USD'],
],
'required' => ['minimum_quantity', 'price', 'currency'],
],
@ -226,6 +227,7 @@ final class DTOJsonSchemaConverter
manufacturing_status: $manufacturingStatus,
provider_url: $productUrl,
footprint: $data['footprint'] ?? null,
gtin: $data['gtin'] ?? null,
notes: null,
datasheets: $datasheets,
images: $images,

View file

@ -29,10 +29,15 @@ use App\Services\AI\AIPlatformRegistry;
use App\Services\InfoProviderSystem\DTOJsonSchemaConverter;
use App\Services\InfoProviderSystem\DTOs\PartDetailDTO;
use App\Settings\InfoProviderSystem\AIExtractorSettings;
use Brick\Schema\SchemaReader;
use Jkphl\Micrometa;
use League\HTMLToMarkdown\HtmlConverter;
use Symfony\AI\Platform\Message\Message;
use Symfony\AI\Platform\Message\MessageBag;
use Symfony\Component\DomCrawler\Crawler;
use Symfony\Contracts\HttpClient\HttpClientInterface;
final class AIInfoExtractor implements InfoProviderInterface
{
use FixAndValidateUrlTrait;
@ -95,16 +100,56 @@ final class AIInfoExtractor implements InfoProviderInterface
$html = $response->getContent();
// Clean HTML
$cleanedHtml = $this->cleanHTML($html);
/*$cleanedHtml = $this->cleanHTML($html);
// Truncate to max content length
$truncatedHtml = $this->truncateHTML($cleanedHtml, $this->settings->maxContentLength);
$truncatedHtml = $this->truncateHTML($cleanedHtml, $this->settings->maxContentLength);*/
$markdown = $this->htmlToMarkdown($html);
//Extract structured data using traditional methods, to provide additional context to the LLM. This can help improve accuracy, especially for technical specifications that might be in tables or specific formats.
$structuredData = $this->extractStructuredData($html, $url);
// Call LLM
$llmResponse = $this->callLLM($truncatedHtml, $url);
$llmResponse = $this->callLLM($markdown, $url, $structuredData);
// Build and return PartDetailDTO
return $this->jsonSchemaConverter->jsonToDTO($llmResponse, $this->getProviderKey(), $url, $url, self::DISTRIBUTOR_NAME);
$result = $this->jsonSchemaConverter->jsonToDTO($llmResponse, $this->getProviderKey(), $url, $url, self::DISTRIBUTOR_NAME);
return $result;
}
/**
 * Extracts the structured metadata embedded in the page using the Micrometa parser
 * (e.g. microformats, HTML microdata or JSON-LD, depending on what the parser finds).
 *
 * The result is handed to the LLM as additional context, so it is encoded with
 * unescaped slashes and unicode characters: URLs and non-ASCII text stay readable
 * and the payload is shorter than with json_encode's default escaping.
 *
 * @param string $html The raw HTML of the page
 * @param string $url The URL of the page; passed to the parser together with the HTML
 * @return string JSON encoded structured data
 * @throws \JsonException If the extracted items cannot be encoded as JSON
 */
private function extractStructuredData(string $html, string $url): string
{
    $parser = new Micrometa\Ports\Parser();
    $items = $parser($url, $html);

    return json_encode(
        $items->toObject(),
        JSON_THROW_ON_ERROR | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE
    );
}
/**
 * Converts the relevant part of an HTML page to Markdown.
 *
 * The first element matching "main, article, #content" is converted if there is one;
 * otherwise the complete HTML input is converted.
 *
 * @param string $html The HTML of the page
 * @return string The Markdown representation of the selected content
 */
private function htmlToMarkdown(string $html): string
{
    // Narrow the page down to its main content area, so the LLM is not flooded with unrelated markup.
    $contentNode = (new Crawler($html))->filter('main, article, #content')->first();

    // Fall back to the complete document when no dedicated content area exists.
    $source = $html;
    if ($contentNode->count() > 0) {
        $source = $contentNode->html();
    }

    // Convert to markdown
    $options = [
        'strip_tags' => true, // Drop tags without a Markdown equivalent (like <div>)
        'hard_break' => true, // Keep line breaks as plain newlines
        'remove_nodes' => 'nav footer script style', // Discard these elements entirely as an extra safety layer
    ];

    return (new HtmlConverter($options))->convert($source);
}
public function getCapabilities(): array
@ -160,13 +205,18 @@ final class AIInfoExtractor implements InfoProviderInterface
return $truncated;
}
private function callLLM(string $htmlContent, string $url): array
private function callLLM(string $htmlContent, string $url, ?string $structuredData = null): array
{
$input = new MessageBag(
Message::forSystem($this->buildSystemPrompt()),
Message::ofUser("Extract part information from this webpage content:\n\nURL: $url\n\n$htmlContent")
);
if ($structuredData) {
$input->add(Message::ofUser("Following data was extracted using traditional methods, but might be incomplete or inaccurate.
Enrich it with the actual website data:\n\n".$structuredData));
}
try {
$aiPlatform = $this->AIPlatformRegistry->getPlatform($this->settings->platform ?? throw new \RuntimeException('No AI platform selected') );
@ -187,29 +237,8 @@ final class AIInfoExtractor implements InfoProviderInterface
private function buildSystemPrompt(): string
{
return <<<'PROMPT'
You are an expert at extracting electronic component information from web pages. Extract structured data in JSON format.
Return ONLY a valid JSON object with this exact structure:
{
"name": "string",
"description": "string",
"manufacturer": "string | null",
"mpn": "string | null",
"category": "string | null",
"manufacturing_status": "active|obsolete|nrfnd|discontinued|null",
"footprint": "string | null",
"mass": "number | null (in grams)",
"parameters": [{"name": "string", "value": "string", "unit": "string | null"}],
"datasheets": [{"url": "string", "description": "string"}],
"images": [{"url": "string", "description": "string"}],
"vendor_infos": [{
"distributor_name": "string",
"order_number": "string | null",
"product_url": "string",
"prices": [{"minimum_quantity": int, "price": number, "currency": "string"}]
}],
"manufacturer_product_url": "string | null"
}
You are an expert at extracting electronic component information from web pages. Extract structured data in JSON format, from markdown extracted from a product page.
Focus on the main content of the page, such as product descriptions, specifications, and tables. Ignore navigation menus, footers, and sidebars.
Rules:
- manufacturing_status: Use "active", "obsolete", "nrfnd" (not recommended for new designs), "discontinued", or null