Improve markdown conversion and add the ability to extract notes

This commit is contained in:
Jan Böhmer 2026-04-26 21:31:07 +02:00
parent 4dbd92ac4d
commit 4a45b5d5a9
2 changed files with 28 additions and 51 deletions

View file

@ -48,14 +48,15 @@ final class DTOJsonSchemaConverter
'type' => 'object', 'type' => 'object',
'properties' => [ 'properties' => [
'name' => ['type' => 'string', 'description' => 'Product name'], 'name' => ['type' => 'string', 'description' => 'Product name'],
'description' => ['type' => 'string', 'description' => 'Product description'], 'description' => ['type' => 'string', 'description' => 'A short description of the product, maybe containing the most important things. Onnly One line.'],
'manufacturer' => ['type' => ['string', 'null'], 'description' => 'Manufacturer name'], 'manufacturer' => ['type' => ['string', 'null'], 'description' => 'Manufacturer name'],
'mpn' => ['type' => ['string', 'null'], 'description' => 'Manufacturer Part Number'], 'mpn' => ['type' => ['string', 'null'], 'description' => 'Manufacturer Part Number'],
'category' => ['type' => ['string', 'null'], 'description' => 'Product category'], 'category' => ['type' => ['string', 'null'], 'description' => 'Product category, e.g. "Passive components -> Resistors"'],
'manufacturing_status' => ['type' => ['string', 'null'], 'enum' => ['active', 'obsolete', 'nrfnd', 'discontinued', null], 'description' => 'Manufacturing status'], 'manufacturing_status' => ['type' => ['string', 'null'], 'enum' => ['active', 'obsolete', 'nrfnd', 'discontinued', null], 'description' => 'Manufacturing status'],
'footprint' => ['type' => ['string', 'null'], 'description' => 'Package/footprint type'], 'footprint' => ['type' => ['string', 'null'], 'description' => 'Package/footprint type, like "SOT-23", "DIP-8", "QFN-32" etc.'],
'mass' => ['type' => ['number', 'null'], 'description' => 'Mass of the product in grams'], 'mass' => ['type' => ['number', 'null'], 'description' => 'Mass of the product in grams'],
'gtin' => ['type' => ['string', 'null'], 'description' => 'Global Trade Item Number (GTIN) / EAN / UPC code'], 'gtin' => ['type' => ['string', 'null'], 'description' => 'Global Trade Item Number (GTIN) / EAN / UPC code for barcodes'],
'notes' => ['type' => ['string', 'null'], 'description' => 'Optional long description of the part with more details than description. Can be markdown formatted.'],
'parameters' => [ 'parameters' => [
'type' => 'array', 'type' => 'array',
'items' => [ 'items' => [
@ -98,6 +99,7 @@ final class DTOJsonSchemaConverter
'distributor_name' => ['type' => 'string', 'description' => 'Name of the distributor or vendor. Typically the shop name'], 'distributor_name' => ['type' => 'string', 'description' => 'Name of the distributor or vendor. Typically the shop name'],
'order_number' => ['type' => ['string', 'null'], 'description' => 'The order number or SKU used by the distributor. Optional, but can help to find the product on the distributor website.'], 'order_number' => ['type' => ['string', 'null'], 'description' => 'The order number or SKU used by the distributor. Optional, but can help to find the product on the distributor website.'],
'product_url' => ['type' => 'string'], 'product_url' => ['type' => 'string'],
'prices_include_vat' => ['type' => ['boolean', 'null'], 'description' => 'Whether the prices include VAT or not. Null if unknown.'],
'prices' => [ 'prices' => [
'type' => 'array', 'type' => 'array',
'items' => [ 'items' => [
@ -194,8 +196,8 @@ final class DTOJsonSchemaConverter
$prices[] = new PriceDTO( $prices[] = new PriceDTO(
minimum_discount_amount: (int) ($p['minimum_quantity'] ?? 1), minimum_discount_amount: (int) ($p['minimum_quantity'] ?? 1),
price: (string) ($p['price'] ?? 0), price: (string) ($p['price'] ?? 0),
currency_iso_code: $p['currency'] ?? 'USD', currency_iso_code: $p['currency'] ?? null,
price_related_quantity: (int) ($p['minimum_quantity'] ?? 1), price_related_quantity: 1,
); );
} }
} }
@ -205,6 +207,7 @@ final class DTOJsonSchemaConverter
order_number: $v['order_number'] ?? 'Unknown', order_number: $v['order_number'] ?? 'Unknown',
prices: $prices, prices: $prices,
product_url: $v['product_url'] ?? $productUrl, product_url: $v['product_url'] ?? $productUrl,
prices_include_vat: $v['prices_include_vat'] ?? null,
); );
} }
} }
@ -228,7 +231,7 @@ final class DTOJsonSchemaConverter
provider_url: $productUrl, provider_url: $productUrl,
footprint: $data['footprint'] ?? null, footprint: $data['footprint'] ?? null,
gtin: $data['gtin'] ?? null, gtin: $data['gtin'] ?? null,
notes: null, notes: $data['notes'],
datasheets: $datasheets, datasheets: $datasheets,
images: $images, images: $images,
parameters: $parameters, parameters: $parameters,

View file

@ -37,6 +37,8 @@ use Symfony\AI\Platform\Message\MessageBag;
use Symfony\Component\DomCrawler\Crawler; use Symfony\Component\DomCrawler\Crawler;
use Symfony\Contracts\HttpClient\HttpClientInterface; use Symfony\Contracts\HttpClient\HttpClientInterface;
use function Symfony\Component\String\u;
final class AIInfoExtractor implements InfoProviderInterface final class AIInfoExtractor implements InfoProviderInterface
{ {
@ -105,7 +107,10 @@ final class AIInfoExtractor implements InfoProviderInterface
// Truncate to max content length // Truncate to max content length
$truncatedHtml = $this->truncateHTML($cleanedHtml, $this->settings->maxContentLength);*/ $truncatedHtml = $this->truncateHTML($cleanedHtml, $this->settings->maxContentLength);*/
//Convert html to markdown, to provide a cleaner input to the LLM.
$markdown = $this->htmlToMarkdown($html); $markdown = $this->htmlToMarkdown($html);
//Truncate markdown to max content length, if needed
$markdown = u($markdown)->truncate($this->settings->maxContentLength, '... [truncated]')->toString();
//Extract structured data using traditional methods, to provide additional context to the LLM. This can help improve accuracy, especially for technical specifications that might be in tables or specific formats. //Extract structured data using traditional methods, to provide additional context to the LLM. This can help improve accuracy, especially for technical specifications that might be in tables or specific formats.
$structuredData = $this->extractStructuredData($html, $url); $structuredData = $this->extractStructuredData($html, $url);
@ -137,10 +142,21 @@ final class AIInfoExtractor implements InfoProviderInterface
{ {
//Extract only the main content of the page to avoid overwhelming the LLM with irrelevant information. //Extract only the main content of the page to avoid overwhelming the LLM with irrelevant information.
$crawler = new Crawler($html); $crawler = new Crawler($html);
$mainContent = $crawler->filter('main, article, #content')->first(); $mainContent = $crawler->filter('main, article, #content');
// If we found a specific content area, get its HTML; otherwise, use the whole body. // If we found a specific content area, get its HTML; otherwise, use the whole body.
$htmlToConvert = $mainContent->count() ? $mainContent->html() : $html; //Concat the html of all matched nodes, to provide more context to the LLM, especially for pages that use multiple sections for product info.
if ($mainContent->count() > 0) {
$htmlToConvert = '';
foreach ($mainContent as $node) {
$htmlToConvert .= $node->ownerDocument->saveHTML($node);
$htmlToConvert .= "\n\n"; // Add some spacing between sections
}
} else {
//Use the whole body content, as it might contain relevant information, especially for simpler pages that don't have a clear main/content section.
$htmlToConvert = $html;
}
//Convert to markdown //Convert to markdown
$converter = new HtmlConverter([ $converter = new HtmlConverter([
@ -163,48 +179,6 @@ final class AIInfoExtractor implements InfoProviderInterface
]; ];
} }
/**
 * Strips parts of an HTML document that are removed before further processing.
 *
 * The following are removed, in this order, together with their contents:
 * <script>, <style>, <nav>, <footer> and <header> elements, and HTML comments.
 * Matching is regex based, case-insensitive and spans multiple lines.
 *
 * @param string $html The raw HTML
 *
 * @return string The HTML with the listed parts removed. If a pattern cannot be applied
 *                because of a PCRE error, that single step is skipped and the content
 *                from the previous step is kept.
 */
private function cleanHTML(string $html): string
{
    $patterns = [
        // Remove script tags
        '/<script\b[^>]*>(.*?)<\/script>/is',
        // Remove style tags
        '/<style\b[^>]*>(.*?)<\/style>/is',
        // Remove nav tags
        '/<nav\b[^>]*>(.*?)<\/nav>/is',
        // Remove footer tags
        '/<footer\b[^>]*>(.*?)<\/footer>/is',
        // Remove header tags
        '/<header\b[^>]*>(.*?)<\/header>/is',
        // Remove HTML comments
        '/<!--(.*?)-->/is',
    ];

    foreach ($patterns as $pattern) {
        $result = preg_replace($pattern, '', $html);

        // preg_replace() returns null when PCRE fails (e.g. the backtrack limit is hit on a very
        // large page). Keep the previous content instead of passing null on and finally
        // violating the declared string return type.
        if ($result !== null) {
            $html = $result;
        }
    }

    return $html;
}
/**
 * Shortens HTML to at most $maxLength bytes.
 *
 * If the content already fits, it is returned unchanged. Otherwise it is cut at $maxLength bytes.
 * When a '>' or a space is found within the last 10% of the allowed length, the cut is moved
 * back to just after that character, to avoid ending in the middle of a tag or word.
 * If no such character is found, a multi-byte UTF-8 character split by the cut is dropped,
 * so the result does not end in an incomplete byte sequence.
 *
 * @param string $html      The HTML to shorten
 * @param int    $maxLength Maximum length of the result in bytes
 *
 * @return string The (possibly shortened) HTML; an empty string when $maxLength is not positive
 *                and the content does not fit
 */
private function truncateHTML(string $html, int $maxLength): string
{
    if (strlen($html) <= $maxLength) {
        return $html;
    }

    // No room for any content. Without this guard a negative length would make substr()
    // cut bytes off the end instead of limiting the length.
    if ($maxLength <= 0) {
        return '';
    }

    $truncated = substr($html, 0, $maxLength);

    // Find the last occurrence of '>' or space, ignoring whichever was not found at all
    $positions = array_filter(
        [strrpos($truncated, '>'), strrpos($truncated, ' ')],
        static fn ($position): bool => $position !== false,
    );
    $lastPos = $positions === [] ? false : max($positions);

    // Only move the cut back if that costs less than 10% of the allowed length
    if ($lastPos !== false && $lastPos > $maxLength * 0.9) {
        return substr($truncated, 0, $lastPos + 1);
    }

    // The byte-wise cut may have split a multi-byte UTF-8 character. Walk back over trailing
    // continuation bytes (10xxxxxx) to the lead byte and drop the character if it is incomplete.
    $length = strlen($truncated);
    for ($back = 1; $back <= min(4, $length); $back++) {
        $byte = ord($truncated[$length - $back]);

        if (($byte & 0xC0) === 0x80) {
            continue;
        }

        if ($byte >= 0xC0) {
            $expected = $byte >= 0xF0 ? 4 : ($byte >= 0xE0 ? 3 : 2);
            if ($back < $expected) {
                $truncated = substr($truncated, 0, $length - $back);
            }
        }

        break;
    }

    return $truncated;
}
private function callLLM(string $htmlContent, string $url, ?string $structuredData = null): array private function callLLM(string $htmlContent, string $url, ?string $structuredData = null): array
{ {
$input = new MessageBag( $input = new MessageBag(