Part-DB-server/src/Services/InfoProviderSystem/Providers/AIWebProvider.php

<?php
/*
 * This file is part of Part-DB (https://github.com/Part-DB/Part-DB-symfony).
 *
 *  Copyright (C) 2019 - 2026 Jan Böhmer (https://github.com/jbtronics)
 *  Copyright (C) 2026 Rahul Singh (https://github.com/rahools)
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as published
 *  by the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

declare(strict_types=1);


namespace App\Services\InfoProviderSystem\Providers;

use App\Exceptions\ProviderIDNotSupportedException;
use App\Helpers\RandomizeUseragentHttpClient;
use App\Services\AI\AIPlatformRegistry;
use App\Services\InfoProviderSystem\SubmittedPageStorage;
use App\Services\InfoProviderSystem\CreateFromUrlHelper;
use App\Services\InfoProviderSystem\DTOJsonSchemaConverter;
use App\Services\InfoProviderSystem\DTOs\PartDetailDTO;
use App\Settings\InfoProviderSystem\AIExtractorSettings;
use Jkphl\Micrometa;
use League\HTMLToMarkdown\HtmlConverter;
use Psr\Cache\CacheItemPoolInterface;
use Symfony\AI\Platform\Message\Message;
use Symfony\AI\Platform\Message\MessageBag;
use Symfony\Component\DomCrawler\Crawler;
use Symfony\Component\DomCrawler\UriResolver;
use Symfony\Component\HttpClient\NoPrivateNetworkHttpClient;
use Symfony\Component\Intl\Languages;
use Symfony\Contracts\HttpClient\HttpClientInterface;

use function Symfony\Component\String\u;


final class AIWebProvider implements InfoProviderInterface
{
    use FixAndValidateUrlTrait;

    private const DISTRIBUTOR_NAME = 'Website';

    private readonly HttpClientInterface $httpClient;

    /**
     * Stores the injected collaborators and prepares the HTTP client used to download web pages.
     *
     * The given client is not used directly: it is wrapped twice and configured with a timeout
     * option before it is assigned to the $httpClient property.
     *
     * @param HttpClientInterface $httpClient The base client that gets wrapped
     */
    public function __construct(
        HttpClientInterface $httpClient,
        private readonly AIExtractorSettings $settings,
        private readonly AIPlatformRegistry $AIPlatformRegistry,
        private readonly DTOJsonSchemaConverter $jsonSchemaConverter,
        private readonly CacheItemPoolInterface $partInfoCache,
        private readonly CreateFromUrlHelper $createFromUrlHelper,
        private readonly SubmittedPageStorage $browserHtmlStorage,
    ) {
        //Inner wrapper: NoPrivateNetworkHttpClient is used to prevent SSRF vulnerabilities
        $ssrfSafeClient = new NoPrivateNetworkHttpClient($httpClient);

        //Outer wrapper: RandomizeUseragentHttpClient makes it harder for servers to block us
        $randomizedClient = new RandomizeUseragentHttpClient($ssrfSafeClient);

        //All requests of this provider share the same timeout option
        $this->httpClient = $randomizedClient->withOptions(['timeout' => 15]);
    }

    /**
     * Returns the static metadata describing this provider.
     *
     * @return array{name: string, description: string, disabled_help: string, settings_class: class-string}
     */
    public function getProviderInfo(): array
    {
        $info = [];
        $info['name'] = 'AI Web Extractor';
        $info['description'] = 'Extract part info from any URL using LLM';
        //No 'url' entry is provided for this provider (a fixed URL like 'https://openrouter.ai' is intentionally not set)
        $info['disabled_help'] = 'Configure AI settings';
        $info['settings_class'] = AIExtractorSettings::class;

        return $info;
    }

    /**
     * Returns the key identifying this provider.
     *
     * The same key is handed to the JSON-to-DTO converter in getDetails().
     */
    public function getProviderKey(): string
    {
        return 'ai_web';
    }

    /**
     * Tells whether the provider is configured well enough to be used.
     *
     * @return bool true only if a platform is selected and a non-empty model name is set
     */
    public function isActive(): bool
    {
        //Without a selected platform there is nothing to talk to
        if ($this->settings->platform === null) {
            return false;
        }

        $model = $this->settings->model;

        //The model must be set and must not be the empty string
        return $model !== null && $model !== '';
    }

    /**
     * Treats the keyword as a URL and returns the part extracted from that page (at most one result).
     *
     * Unless OPTION_SKIP_DELEGATION is set, another provider gets the chance to handle the URL first.
     * Otherwise the lookup is forwarded to getDetails() with delegation disabled.
     *
     * NOTE(review): the initial fixAndValidateURL() call runs outside the try block, so anything it
     * throws reaches the caller directly - presumably intended, verify against the callers.
     *
     * @param  string  $keyword The URL to extract the part information from
     * @param  array  $options Provider options; they are passed on to getDetails()
     * @return array A list with the single found part, or an empty list if getDetails() reports the ID as not supported
     */
    public function searchByKeyword(string $keyword, array $options = []): array
    {
        $url = $this->fixAndValidateURL($keyword);

        $skipDelegation = $options[self::OPTION_SKIP_DELEGATION] ?? false;
        if (!$skipDelegation) {
            //Before loading the page, give other providers the chance to handle this URL
            $delegated = $this->createFromUrlHelper->delegateToOtherProvider($url, $this);
            if ($delegated !== null) {
                return [$delegated];
            }
        }

        //Delegation must be skipped for the getDetails call to prevent infinite loops
        $detailOptions = $options;
        $detailOptions[self::OPTION_SKIP_DELEGATION] = true;

        try {
            $part = $this->getDetails($keyword, $detailOptions);
        } catch (ProviderIDNotSupportedException) {
            return [];
        }

        return [$part];
    }

    /**
     * Extracts the part details from the web page behind the given URL with the help of an LLM.
     *
     * Steps: optional delegation to another provider, cache lookup, obtaining the page HTML
     * (submitted page or own HTTP request), conversion to truncated markdown, structured data
     * extraction, LLM call, conversion to a DTO and caching of the result.
     *
     * @param  string  $id The URL of the page describing the part
     * @param  array  $options Supported keys: OPTION_SKIP_DELEGATION, OPTION_NO_CACHE, OPTION_SUBMITTED_PAGE_TOKEN
     * @return PartDetailDTO The delegated, cached or freshly extracted part details
     */
    public function getDetails(string $id, array $options = []): PartDetailDTO
    {
        $url = $this->fixAndValidateURL($id);

        $delegationAllowed = !($options[self::OPTION_SKIP_DELEGATION] ?? false);
        if ($delegationAllowed) {
            //Before loading the page, give other providers the chance to handle this URL
            $delegated = $this->createFromUrlHelper->delegateToOtherProviderDetails($url, $this);
            if ($delegated !== null) {
                return $delegated;
            }
        }

        //Results are cached per URL, to avoid unnecessary LLM calls, which can be slow and costly.
        $cacheKey = 'ai_web_'.hash('xxh3', $url);

        //With the no-cache option, the stale entry is dropped, so that the lookup below is a guaranteed miss
        $bypassCache = $options[self::OPTION_NO_CACHE] ?? false;
        if ($bypassCache) {
            $this->partInfoCache->deleteItem($cacheKey);
        }

        $cacheItem = $this->partInfoCache->getItem($cacheKey);
        if ($cacheItem->isHit()) {
            return $cacheItem->get();
        }

        //Prefer the page HTML submitted under the given token, if there is any stored for it...
        $token = $options[self::OPTION_SUBMITTED_PAGE_TOKEN] ?? '';
        $html = $token !== '' ? $this->browserHtmlStorage->retrieve($token)?->html : null;

        //...and download the page ourselves otherwise.
        $html ??= $this->httpClient->request('GET', $url)->getContent();

        //The LLM gets markdown instead of raw HTML as a cleaner input, cut down to the configured maximum length
        $markdown = u($this->htmlToMarkdown($html, $url))
            ->truncate($this->settings->maxContentLength, '... [truncated]')
            ->toString();

        //Structured data found in the page markup is passed along as additional context for the LLM
        $structuredData = $this->extractStructuredData($html, $url);

        $extracted = $this->callLLM($markdown, $url, $structuredData);

        $part = $this->jsonSchemaConverter->jsonToDTO(
            $extracted,
            $this->getProviderKey(),
            $url,
            $url,
            self::DISTRIBUTOR_NAME
        );

        //Cache for 2 hours: web content can change frequently, but repeated accesses should still benefit
        $cacheItem->set($part);
        $cacheItem->expiresAfter(3600 * 2);
        $this->partInfoCache->save($cacheItem);

        return $part;
    }

    /**
     * Extracts the structured data (JSON-LD, Microdata and RDFa Lite) embedded in the HTML.
     *
     * Parsing is attempted with RDFa Lite first and retried without it; if both attempts fail
     * with a RuntimeException, an empty JSON object is returned.
     *
     * @param  string  $html The HTML of the page
     * @param  string  $url The URL of the page, handed to the parser together with the HTML
     * @return string JSON encoded structured data, or '{}' if the page could not be parsed
     */
    private function extractStructuredData(string $html, string $url): string
    {
        //Only microdata, json-ld and rdfa are parsed, as they are the most common formats for structured data
        //on product pages. Links and microformats only create clutter for the LLM.
        $formatSets = [
            Micrometa\Ports\Format::JSON_LD | Micrometa\Ports\Format::MICRODATA | Micrometa\Ports\Format::RDFA_LITE,
            //Fallback without rdfa, as it seems to cause problems on pages like ebay
            Micrometa\Ports\Format::JSON_LD | Micrometa\Ports\Format::MICRODATA,
        ];

        foreach ($formatSets as $formats) {
            try {
                $parser = new Micrometa\Ports\Parser($formats);
                $items = $parser($url, $html);
            } catch (\RuntimeException) {
                //Parsing failed with this format set, try the next (more restricted) one
                continue;
            }

            return json_encode($items->toObject(), JSON_THROW_ON_ERROR);
        }

        //Every attempt failed, so there is no structured data to provide
        return '{}';
    }

    /**
     * Converts the relevant part of an HTML page to markdown, as a cleaner input for the LLM.
     *
     * Link and image URLs are made absolute first. If the page contains main, article or #content
     * elements, only those are converted (each piece of content only once, even if the matched
     * elements are nested inside each other); otherwise the whole document is converted.
     *
     * @param  string  $html The HTML of the page
     * @param  string  $url The URL of the page, used as base for resolving relative URLs
     * @return string The markdown representation of the page content
     */
    private function htmlToMarkdown(string $html, string $url): string
    {
        //The page URL is passed to the crawler, so that a relative <base href> gets resolved against it
        //and getBaseHref() always yields an absolute URL.
        $crawler = new Crawler($html, $url);

        $baseUrl = $crawler->getBaseHref() ?? $url;

        //Replace all relative link and image URLs with their absolute counterparts, to ensure that any URL
        //which ends up in the markdown is valid and can be accessed if needed.
        foreach (['a' => 'href', 'img' => 'src'] as $tag => $attribute) {
            $crawler->filter($tag)->each(static function (Crawler $node) use ($attribute, $baseUrl): void {
                $value = $node->attr($attribute);
                if ($value) {
                    $absoluteUrl = UriResolver::resolve($value, $baseUrl);
                    //@phpstan-ignore-next-line getNode(0) returns an element here, as we only filter for 'a' and 'img' tags.
                    $node->getNode(0)->setAttribute($attribute, $absoluteUrl);
                }
            });
        }

        //Extract only the main content of the page to avoid overwhelming the LLM with irrelevant information.
        $contentNodes = iterator_to_array($crawler->filter('main, article, #content'), false);

        //A matched node which lives inside another matched node (e.g. an <article> inside <main>) is already
        //part of the HTML of that ancestor. Serializing it again would duplicate content and waste the
        //limited content length budget.
        $isNestedMatch = static function ($node) use ($contentNodes): bool {
            for ($ancestor = $node->parentNode; $ancestor !== null; $ancestor = $ancestor->parentNode) {
                foreach ($contentNodes as $candidate) {
                    if ($ancestor->isSameNode($candidate)) {
                        return true;
                    }
                }
            }

            return false;
        };

        if ($contentNodes !== []) {
            //Concat the HTML of all outermost matched nodes, as some pages spread the product info over
            //multiple sections.
            $htmlToConvert = '';
            foreach ($contentNodes as $node) {
                if ($isNestedMatch($node)) {
                    continue;
                }

                $htmlToConvert .= $node->ownerDocument->saveHTML($node);
                $htmlToConvert .= "\n\n"; // Add some spacing between sections
            }
        } else {
            //No dedicated content area found: use the complete document, as simpler pages often have no
            //clear main/content section.
            $htmlToConvert = $crawler->outerHtml();
        }

        //Convert to markdown
        $converter = new HtmlConverter([
            'strip_tags' => true,      // Removes tags that aren't Markdown-compatible (like <div>)
            'hard_break' => true,      // Preserves line breaks
            'remove_nodes' => 'nav footer script style' // Extra safety layer
        ]);

        return $converter->convert($htmlToConvert);
    }

    /**
     * Lists the kinds of information this provider can deliver.
     *
     * @return ProviderCapabilities[]
     */
    public function getCapabilities(): array
    {
        $capabilities = [
            ProviderCapabilities::BASIC,
            ProviderCapabilities::PICTURE,
            ProviderCapabilities::DATASHEET,
            ProviderCapabilities::PRICE,
            ProviderCapabilities::PARAMETERS,
        ];

        return $capabilities;
    }

    /**
     * Sends the page content to the configured LLM and returns the extracted data.
     *
     * The request consists of the system prompt, a user message with the URL and the page content and,
     * if structured data is given, a second user message containing it. The response format is
     * constrained to the JSON schema provided by the schema converter.
     *
     * @param  string  $htmlContent The page content put into the prompt (getDetails() passes the markdown version of the page)
     * @param  string  $url The URL of the page, included in the prompt
     * @param  string|null  $structuredData JSON encoded structured data of the page; skipped if null or otherwise falsy
     * @return array The content of the LLM result
     * @throws \RuntimeException If no platform or model is configured, or if the invocation fails for any reason
     */
    private function callLLM(string $htmlContent, string $url, ?string $structuredData = null): array
    {
        $messages = new MessageBag(
            Message::forSystem($this->buildSystemPrompt()),
            Message::ofUser(sprintf(
                "Extract part information from this webpage content:\n\nURL: %s\n\n%s",
                $url,
                $htmlContent
            ))
        );

        if ($structuredData) {
            $messages->add(Message::ofUser("Following data was extracted using traditional methods, but might be incomplete or inaccurate.
             Enrich it with the actual website data:\n\n".$structuredData));
        }

        try {
            //The configuration checks happen inside the try block, so that they get wrapped like any other failure
            $selectedPlatform = $this->settings->platform ?? throw new \RuntimeException('No AI platform selected');
            $platform = $this->AIPlatformRegistry->getPlatform($selectedPlatform);

            $selectedModel = $this->settings->model ?? throw new \RuntimeException('No model selected');

            //Force the model to answer with JSON matching our schema
            $invokeOptions = [
                'response_format' => [
                    'type' => 'json_schema',
                    'json_schema' => $this->jsonSchemaConverter->getJSONSchema(),
                ],
            ];

            $result = $platform->invoke($selectedModel, $messages, $invokeOptions);
        } catch (\Throwable $e) {
            throw new \RuntimeException('LLM invocation failed: '.$e->getMessage(), previous: $e);
        }

        return $result->getResult()->getContent();
    }

    /**
     * Assembles the system prompt for the LLM.
     *
     * The prompt consists of the fixed extraction rules, an instruction about the response language
     * (depending on the outputLanguage setting) and the additional instructions from the settings, if any.
     *
     * @return string The complete system prompt
     */
    private function buildSystemPrompt(): string
    {
        $prompt = <<<'PROMPT'
You are an expert at extracting electronic component information from web pages. Extract structured data in JSON format, from markdown extracted from a product page.
Focus on the main content of the page, such as product descriptions, specifications, and tables. Ignore navigation menus, footers, and sidebars.

Rules:
- manufacturing_status: Use "active", "obsolete", "nrfnd" (not recommended for new designs), "discontinued", or null
- parameters: Extract technical specs like voltage, current, temperature, etc. and put them into the fields according to the JSON schema. Include units if available.
- prices: Extract pricing tiers with minimum_quantity, price, and currency code
- URLs must be absolute (include https://...)
- If information is not found, use null
- Try to avoid duplicating parameters, if the same parameter is mentioned multiple times, or if it is already used in another field.
- Include only the 1 to 3 most relevant images, such as the main product image or important diagrams. Ignore decorative images, logos, or icons.
- Extract GTIN / EAN if available, as it can be useful for matching parts across different sources, even if the part number is different.
- Include detailed product description into notes field, as it can contain important information that doesn't fit into other fields, such as features, applications, or unique selling points.

PROMPT;

        //Without a configured output language the LLM should answer in the language of the page;
        //otherwise the English name of the configured language is put into the prompt.
        $outputLanguage = $this->settings->outputLanguage;
        $prompt .= $outputLanguage === null
            ? "\n\nProvide the response in the same language of the webpage."
            : "\n\nThe response must be in ". Languages::getName($outputLanguage, 'en') ." language. Translate texts if needed.";

        //Custom instructions from the settings always come last
        $additionalInstructions = $this->settings->additionalInstructions;
        if ($additionalInstructions) {
            $prompt .= "\n\nAdditional instructions:\n" . $additionalInstructions;
        }

        return $prompt;
    }

}