mirror of
https://github.com/Part-DB/Part-DB-server.git
synced 2026-05-21 19:01:42 +00:00
Improved AI extractor
It now gives better results and uses fewer tokens
This commit is contained in:
parent
fcd598286a
commit
3c9866e90d
2 changed files with 7 additions and 2 deletions
|
|
@ -117,7 +117,7 @@ This provider can be particularly useful for extracting information from website
|
||||||
It also potentially extracts more detailed information than the Generic Web URL Provider, as it is not limited to the fields defined in the Schema.org format.
|
It also potentially extracts more detailed information than the Generic Web URL Provider, as it is not limited to the fields defined in the Schema.org format.
|
||||||
|
|
||||||
To use the AI Web Extractor, you need to set up an AI platform in the AI settings tab and choose a model that supports structured output.
|
To use the AI Web Extractor, you need to set up an AI platform in the AI settings tab and choose a model that supports structured output.
|
||||||
For many use cases a small and cheap model like `google/gemini-2.5-flash-lite` will be sufficient, coming down to costs like 0.003$ per request.
|
For many use cases a small and cheap model like `google/gemini-2.5-flash-lite` will be sufficient, coming down to costs like 0.001$ per request.
|
||||||
For more complex websites, or if you want to use the LLM for translation purposes too, you should consider a more powerful model.
|
For more complex websites, or if you want to use the LLM for translation purposes too, you should consider a more powerful model.
|
||||||
|
|
||||||
You can add additional instructions for the model, which are added to the system prompt, to tweak the model's output.
|
You can add additional instructions for the model, which are added to the system prompt, to tweak the model's output.
|
||||||
|
|
|
||||||
|
|
@ -32,6 +32,7 @@ use App\Services\InfoProviderSystem\DTOJsonSchemaConverter;
|
||||||
use App\Services\InfoProviderSystem\DTOs\PartDetailDTO;
|
use App\Services\InfoProviderSystem\DTOs\PartDetailDTO;
|
||||||
use App\Settings\InfoProviderSystem\AIExtractorSettings;
|
use App\Settings\InfoProviderSystem\AIExtractorSettings;
|
||||||
use Brick\Schema\SchemaReader;
|
use Brick\Schema\SchemaReader;
|
||||||
|
use Imagine\Image\Format;
|
||||||
use Jkphl\Micrometa;
|
use Jkphl\Micrometa;
|
||||||
use League\HTMLToMarkdown\HtmlConverter;
|
use League\HTMLToMarkdown\HtmlConverter;
|
||||||
use Psr\Cache\CacheItemPoolInterface;
|
use Psr\Cache\CacheItemPoolInterface;
|
||||||
|
|
@ -174,7 +175,8 @@ final class AIWebProvider implements InfoProviderInterface
|
||||||
*/
|
*/
|
||||||
private function extractStructuredData(string $html, string $url): string
|
private function extractStructuredData(string $html, string $url): string
|
||||||
{
|
{
|
||||||
$micrometa = new Micrometa\Ports\Parser();
|
// Only parse Microdata, JSON-LD and RDFa Lite, as they are the most common formats for structured data on product pages. Link types and Microformats only create clutter for the LLM.
|
||||||
|
$micrometa = new Micrometa\Ports\Parser(Micrometa\Ports\Format::JSON_LD | Micrometa\Ports\Format::MICRODATA | Micrometa\Ports\Format::RDFA_LITE);
|
||||||
$items = $micrometa($url, $html);
|
$items = $micrometa($url, $html);
|
||||||
|
|
||||||
return json_encode($items->toObject(), JSON_THROW_ON_ERROR);
|
return json_encode($items->toObject(), JSON_THROW_ON_ERROR);
|
||||||
|
|
@ -264,6 +266,9 @@ Rules:
|
||||||
- If information is not found, use null
|
- If information is not found, use null
|
||||||
- Try to avoid duplicating parameters, if the same parameter is mentioned multiple times, or if it is already used in another field.
|
- Try to avoid duplicating parameters, if the same parameter is mentioned multiple times, or if it is already used in another field.
|
||||||
- Include only the 1 to 3 most relevant images, such as the main product image or important diagrams. Ignore decorative images, logos, or icons.
|
- Include only the 1 to 3 most relevant images, such as the main product image or important diagrams. Ignore decorative images, logos, or icons.
|
||||||
|
- Extract GTIN / EAN if available, as it can be useful for matching parts across different sources, even if the part number is different.
|
||||||
|
- Include detailed product description into notes field, as it can contain important information that doesn't fit into other fields, such as features, applications, or unique selling points.
|
||||||
|
|
||||||
PROMPT;
|
PROMPT;
|
||||||
|
|
||||||
if ($this->settings->outputLanguage === null) {
|
if ($this->settings->outputLanguage === null) {
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue