From 38779740ec729b28e40a706f5fe66aa47b298263 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Tue, 5 May 2026 23:19:56 +0200 Subject: [PATCH] AIWebProvider: Make URLs absolute before passing them to the LLM This ensures that the URLs are valid afterwards, because the LLM does not know the base tag --- .../Providers/AIWebProvider.php | 31 ++++++++++++++++--- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/src/Services/InfoProviderSystem/Providers/AIWebProvider.php b/src/Services/InfoProviderSystem/Providers/AIWebProvider.php index 7f4a3586..164ee341 100644 --- a/src/Services/InfoProviderSystem/Providers/AIWebProvider.php +++ b/src/Services/InfoProviderSystem/Providers/AIWebProvider.php @@ -39,6 +39,7 @@ use Psr\Cache\CacheItemPoolInterface; use Symfony\AI\Platform\Message\Message; use Symfony\AI\Platform\Message\MessageBag; use Symfony\Component\DomCrawler\Crawler; +use Symfony\Component\DomCrawler\UriResolver; use Symfony\Component\HttpClient\NoPrivateNetworkHttpClient; use Symfony\Component\Intl\Languages; use Symfony\Contracts\HttpClient\HttpClientInterface; @@ -146,7 +147,7 @@ final class AIWebProvider implements InfoProviderInterface $html = $response->getContent(); //Convert html to markdown, to provide a cleaner input to the LLM. - $markdown = $this->htmlToMarkdown($html); + $markdown = $this->htmlToMarkdown($html, $url); //Truncate markdown to max content length, if needed $markdown = u($markdown)->truncate($this->settings->maxContentLength, '... [truncated]')->toString(); @@ -182,10 +183,32 @@ final class AIWebProvider implements InfoProviderInterface return json_encode($items->toObject(), JSON_THROW_ON_ERROR); } - private function htmlToMarkdown(string $html): string + private function htmlToMarkdown(string $html, string $url): string { - //Extract only the main content of the page to avoid overwhelming the LLM with irrelevant information. + $crawler = new Crawler($html); + + //Replace relative URLs with absolute URLs, to ensure that the LLM has full context and can access the links if needed. + $baseUrl = $crawler->getBaseHref() ?? $url; + + //Replace all relative links with their absolute counnterparts, to provide more context to the LLM and to ensure that any links included in the markdown are valid and can be accessed if needed. + $crawler->filter('a')->each(function (Crawler $node) use ($baseUrl) { + $href = $node->attr('href'); + if ($href) { + $absoluteUrl = UriResolver::resolve($href, $baseUrl); + $node->getNode(0)->setAttribute('href', $absoluteUrl); + } + }); + + $crawler->filter('img')->each(function (Crawler $node) use ($baseUrl) { + $src = $node->attr('src'); + if ($src) { + $absoluteUrl = UriResolver::resolve($src, $baseUrl); + $node->getNode(0)->setAttribute('src', $absoluteUrl); + } + }); + + //Extract only the main content of the page to avoid overwhelming the LLM with irrelevant information. $mainContent = $crawler->filter('main, article, #content'); // If we found a specific content area, get its HTML; otherwise, use the whole body. @@ -198,7 +221,7 @@ final class AIWebProvider implements InfoProviderInterface } } else { //Use the whole body content, as it might contain relevant information, especially for simpler pages that don't have a clear main/content section. - $htmlToConvert = $html; + $htmlToConvert = $crawler->outerHtml(); }