diff --git a/src/Services/InfoProviderSystem/DTOJsonSchemaConverter.php b/src/Services/InfoProviderSystem/DTOJsonSchemaConverter.php index 2d297243..759c3d12 100644 --- a/src/Services/InfoProviderSystem/DTOJsonSchemaConverter.php +++ b/src/Services/InfoProviderSystem/DTOJsonSchemaConverter.php @@ -48,14 +48,15 @@ final class DTOJsonSchemaConverter 'type' => 'object', 'properties' => [ 'name' => ['type' => 'string', 'description' => 'Product name'], - 'description' => ['type' => 'string', 'description' => 'Product description'], + 'description' => ['type' => 'string', 'description' => 'A short description of the product, maybe containing the most important things. Only one line.'], 'manufacturer' => ['type' => ['string', 'null'], 'description' => 'Manufacturer name'], 'mpn' => ['type' => ['string', 'null'], 'description' => 'Manufacturer Part Number'], - 'category' => ['type' => ['string', 'null'], 'description' => 'Product category'], + 'category' => ['type' => ['string', 'null'], 'description' => 'Product category, e.g. "Passive components -> Resistors"'], 'manufacturing_status' => ['type' => ['string', 'null'], 'enum' => ['active', 'obsolete', 'nrfnd', 'discontinued', null], 'description' => 'Manufacturing status'], - 'footprint' => ['type' => ['string', 'null'], 'description' => 'Package/footprint type'], + 'footprint' => ['type' => ['string', 'null'], 'description' => 'Package/footprint type, like "SOT-23", "DIP-8", "QFN-32" etc.'], 'mass' => ['type' => ['number', 'null'], 'description' => 'Mass of the product in grams'], - 'gtin' => ['type' => ['string', 'null'], 'description' => 'Global Trade Item Number (GTIN) / EAN / UPC code'], + 'gtin' => ['type' => ['string', 'null'], 'description' => 'Global Trade Item Number (GTIN) / EAN / UPC code for barcodes'], + 'notes' => ['type' => ['string', 'null'], 'description' => 'Optional long description of the part with more details than description. 
Can be markdown formatted.'], 'parameters' => [ 'type' => 'array', 'items' => [ @@ -98,6 +99,7 @@ final class DTOJsonSchemaConverter 'distributor_name' => ['type' => 'string', 'description' => 'Name of the distributor or vendor. Typically the shop name'], 'order_number' => ['type' => ['string', 'null'], 'description' => 'The order number or SKU used by the distributor. Optional, but can help to find the product on the distributor website.'], 'product_url' => ['type' => 'string'], + 'prices_include_vat' => ['type' => ['boolean', 'null'], 'description' => 'Whether the prices include VAT or not. Null if unknown.'], 'prices' => [ 'type' => 'array', 'items' => [ @@ -194,8 +196,8 @@ final class DTOJsonSchemaConverter $prices[] = new PriceDTO( minimum_discount_amount: (int) ($p['minimum_quantity'] ?? 1), price: (string) ($p['price'] ?? 0), - currency_iso_code: $p['currency'] ?? 'USD', - price_related_quantity: (int) ($p['minimum_quantity'] ?? 1), + currency_iso_code: $p['currency'] ?? null, + price_related_quantity: 1, ); } } @@ -205,6 +207,7 @@ final class DTOJsonSchemaConverter order_number: $v['order_number'] ?? 'Unknown', prices: $prices, product_url: $v['product_url'] ?? $productUrl, + prices_include_vat: $v['prices_include_vat'] ?? null, ); } } @@ -228,7 +231,7 @@ final class DTOJsonSchemaConverter provider_url: $productUrl, footprint: $data['footprint'] ?? null, gtin: $data['gtin'] ?? 
null, - notes: null, + notes: $data['notes'] ?? null, datasheets: $datasheets, images: $images, parameters: $parameters, diff --git a/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php b/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php index 7ae858a6..92ed4e19 100644 --- a/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php +++ b/src/Services/InfoProviderSystem/Providers/AIInfoExtractor.php @@ -37,6 +37,8 @@ use Symfony\AI\Platform\Message\MessageBag; use Symfony\Component\DomCrawler\Crawler; use Symfony\Contracts\HttpClient\HttpClientInterface; +use function Symfony\Component\String\u; + final class AIInfoExtractor implements InfoProviderInterface { @@ -105,7 +107,10 @@ final class AIInfoExtractor implements InfoProviderInterface // Truncate to max content length $truncatedHtml = $this->truncateHTML($cleanedHtml, $this->settings->maxContentLength);*/ + //Convert html to markdown, to provide a cleaner input to the LLM. $markdown = $this->htmlToMarkdown($html); + //Truncate markdown to max content length, if needed + $markdown = u($markdown)->truncate($this->settings->maxContentLength, '... [truncated]')->toString(); //Extract structured data using traditional methods, to provide additional context to the LLM. This can help improve accuracy, especially for technical specifications that might be in tables or specific formats. $structuredData = $this->extractStructuredData($html, $url); @@ -137,10 +142,21 @@ final class AIInfoExtractor implements InfoProviderInterface { //Extract only the main content of the page to avoid overwhelming the LLM with irrelevant information. $crawler = new Crawler($html); - $mainContent = $crawler->filter('main, article, #content')->first(); + $mainContent = $crawler->filter('main, article, #content'); // If we found a specific content area, get its HTML; otherwise, use the whole body. - $htmlToConvert = $mainContent->count() ? 
$mainContent->html() : $html; + //Concat the html of all matched nodes, to provide more context to the LLM, especially for pages that use multiple sections for product info. + if ($mainContent->count() > 0) { + $htmlToConvert = ''; + foreach ($mainContent as $node) { + $htmlToConvert .= $node->ownerDocument->saveHTML($node); + $htmlToConvert .= "\n\n"; // Add some spacing between sections + } + } else { + //Use the whole body content, as it might contain relevant information, especially for simpler pages that don't have a clear main/content section. + $htmlToConvert = $html; + } + //Concert to markdown $converter = new HtmlConverter([ @@ -163,48 +179,6 @@ final class AIInfoExtractor implements InfoProviderInterface ]; } - private function cleanHTML(string $html): string - { - // Remove script tags - $html = preg_replace('/]*>(.*?)<\/script>/is', '', $html); - - // Remove style tags - $html = preg_replace('/]*>(.*?)<\/style>/is', '', $html); - - // Remove nav tags - $html = preg_replace('/]*>(.*?)<\/nav>/is', '', $html); - - // Remove footer tags - $html = preg_replace('/]*>(.*?)<\/footer>/is', '', $html); - - // Remove header tags - $html = preg_replace('/]*>(.*?)<\/header>/is', '', $html); - - // Remove HTML comments - $html = preg_replace('//is', '', $html); - - return $html; - } - - private function truncateHTML(string $html, int $maxLength): string - { - if (strlen($html) <= $maxLength) { - return $html; - } - - // Truncate and find the last > or space to avoid cutting tags - $truncated = substr($html, 0, $maxLength); - - // Find the last occurrence of > or space - $lastPos = max(strrpos($truncated, '>'), strrpos($truncated, ' ')); - - if ($lastPos !== false && $lastPos > $maxLength * 0.9) { - $truncated = substr($truncated, 0, $lastPos + 1); - } - - return $truncated; - } - private function callLLM(string $htmlContent, string $url, ?string $structuredData = null): array { $input = new MessageBag(