mirror of
https://github.com/Part-DB/Part-DB-server.git
synced 2026-05-10 07:02:13 +00:00
Improved AI extractor
It now gives better results and uses fewer tokens
This commit is contained in:
parent
fcd598286a
commit
3c9866e90d
2 changed files with 7 additions and 2 deletions
|
|
@ -117,7 +117,7 @@ This provider can be particularly useful for extracting information from website
|
|||
It also potentially extracts more detailed information than the Generic Web URL Provider, as it is not limited to the fields defined in the Schema.org format.
|
||||
|
||||
To use the AI Web Extractor, you need to set up an AI platform in the AI settings tab and choose a model that supports structured output.
|
||||
For many use cases, a small and cheap model like `google/gemini-2.5-flash-lite` will be sufficient, bringing costs down to roughly $0.003 per request.
|
||||
For many use cases, a small and cheap model like `google/gemini-2.5-flash-lite` will be sufficient, bringing costs down to roughly $0.001 per request.
|
||||
For more complex websites, or if you also want to use the LLM for translation purposes, you should consider a more powerful model.
|
||||
|
||||
You can add additional instructions for the model, which are appended to the system prompt, to tweak the model's output.
|
||||
|
|
|
|||
|
|
@ -32,6 +32,7 @@ use App\Services\InfoProviderSystem\DTOJsonSchemaConverter;
|
|||
use App\Services\InfoProviderSystem\DTOs\PartDetailDTO;
|
||||
use App\Settings\InfoProviderSystem\AIExtractorSettings;
|
||||
use Brick\Schema\SchemaReader;
|
||||
use Imagine\Image\Format;
|
||||
use Jkphl\Micrometa;
|
||||
use League\HTMLToMarkdown\HtmlConverter;
|
||||
use Psr\Cache\CacheItemPoolInterface;
|
||||
|
|
@ -174,7 +175,8 @@ final class AIWebProvider implements InfoProviderInterface
|
|||
*/
|
||||
private function extractStructuredData(string $html, string $url): string
|
||||
{
|
||||
$micrometa = new Micrometa\Ports\Parser();
|
||||
//Only parse microdata, json-ld and rdfa, as they are the most common formats for structured data on product pages. Links and microformat only create clutter for the LLM
|
||||
$micrometa = new Micrometa\Ports\Parser(Micrometa\Ports\Format::JSON_LD | Micrometa\Ports\Format::MICRODATA | Micrometa\Ports\Format::RDFA_LITE);
|
||||
$items = $micrometa($url, $html);
|
||||
|
||||
return json_encode($items->toObject(), JSON_THROW_ON_ERROR);
|
||||
|
|
@ -264,6 +266,9 @@ Rules:
|
|||
- If information is not found, use null
|
||||
- Try to avoid duplicating parameters, if the same parameter is mentioned multiple times, or if it is already used in another field.
|
||||
- Include only the 1 to 3 most relevant images, such as the main product image or important diagrams. Ignore decorative images, logos, or icons.
|
||||
- Extract GTIN / EAN if available, as it can be useful for matching parts across different sources, even if the part number is different.
|
||||
- Include detailed product description into notes field, as it can contain important information that doesn't fit into other fields, such as features, applications, or unique selling points.
|
||||
|
||||
PROMPT;
|
||||
|
||||
if ($this->settings->outputLanguage === null) {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue