From 98c978ff1bb6a7effe17cd541dbc9f4513c2e483 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20B=C3=B6hmer?= Date: Tue, 5 May 2026 23:52:14 +0200 Subject: [PATCH] Improved RandomizeUseragentHttpClient by not using old user agent strings, but different modern profiles where also other headers match the user agent --- src/Helpers/RandomizeUseragentHttpClient.php | 138 ++++++++++++++----- 1 file changed, 107 insertions(+), 31 deletions(-) diff --git a/src/Helpers/RandomizeUseragentHttpClient.php b/src/Helpers/RandomizeUseragentHttpClient.php index bca91c79..4b0d11b0 100644 --- a/src/Helpers/RandomizeUseragentHttpClient.php +++ b/src/Helpers/RandomizeUseragentHttpClient.php @@ -29,53 +29,128 @@ use Symfony\Contracts\HttpClient\ResponseStreamInterface; /** * HttpClient wrapper that randomizes the user agent for each request, to make it harder for servers to detect and block us. + * It also sets some other headers to make the requests look more like real browser requests. * When we get a 503, 403 or 429, we assume that the server is blocking us and try again with a different user agent, until we run out of retries. 
*/ final class RandomizeUseragentHttpClient implements HttpClientInterface { - public const USER_AGENTS = [ - "Mozilla/5.0 (Windows; U; Windows NT 10.0; Win64; x64) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/52.0.1359.302 Safari/600.6 Edge/15.25690", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299", - "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_8_3) Gecko/20100101 Firefox/51.6", - "Mozilla/5.0 (Android; Android 4.4.4; E:number:20-23:00 Build/24.0.B.1.34) AppleWebKit/603.18 (KHTML, like Gecko) Chrome/47.0.1559.384 Mobile Safari/600.5", - "Mozilla/5.0 (compatible; MSIE 9.0; Windows; Windows NT 6.3; WOW64 Trident/5.0)", - "Mozilla/5.0 (Windows; Windows NT 6.0; Win64; x64) AppleWebKit/602.21 (KHTML, like Gecko) Chrome/51.0.3187.154 Safari/536", - "Mozilla/5.0 (iPhone; CPU iPhone OS 9_4_2; like Mac OS X) AppleWebKit/537.24 (KHTML, like Gecko) Chrome/51.0.2432.275 Mobile Safari/535.6", - "Mozilla/5.0 (U; Linux i680 ) Gecko/20100101 Firefox/57.5", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 8_8_6; en-US) Gecko/20100101 Firefox/53.9", - "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.46 (KHTML, like Gecko) Chrome/55.0.3276.345 Safari/535", - "Mozilla/5.0 (Windows; Windows NT 10.5;) AppleWebKit/535.42 (KHTML, like Gecko) Chrome/53.0.1176.353 Safari/534.0 Edge/11.95743", - "Mozilla/5.0 (Linux; Android 5.1.1; MOTO G Build/LPH223) AppleWebKit/600.27 (KHTML, like Gecko) Chrome/47.0.1604.204 Mobile Safari/535.1", - "Mozilla/5.0 (iPod; CPU iPod OS 7_4_8; like Mac OS X) AppleWebKit/534.17 (KHTML, like Gecko) Chrome/50.0.1632.146 Mobile Safari/600.4", - "Mozilla/5.0 (Linux; U; Linux i570 ; en-US) Gecko/20100101 Firefox/49.9", - "Mozilla/5.0 (Windows NT 10.2; WOW64; en-US) AppleWebKit/603.2 (KHTML, like Gecko) Chrome/55.0.1299.311 Safari/535", - "Mozilla/5.0 (Windows; Windows NT 10.5; x64; en-US) AppleWebKit/603.39 (KHTML, like Gecko) Chrome/52.0.1443.139 Safari/536.6 
Edge/13.79436", - "Mozilla/5.0 (Linux; U; Android 5.1; SM-G9350T Build/MMB29M) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/55.0.2552.307 Mobile Safari/600.8", - "Mozilla/5.0 (Android; Android 6.0; SAMSUNG SM-D9350V Build/MDB08L) AppleWebKit/535.30 (KHTML, like Gecko) Chrome/53.0.1345.278 Mobile Safari/537.4", - "Mozilla/5.0 (Windows; Windows NT 10.0;) AppleWebKit/534.44 (KHTML, like Gecko) Chrome/47.0.3503.387 Safari/601", + private const PROFILES = [ + // --- CHROME ON WINDOWS --- + 'chrome_windows' => [ + 'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36', + 'Sec-Ch-Ua' => '"Google Chrome";v="142", "Chromium";v="142", "Not=A?Brand";v="99"', + 'Sec-Ch-Ua-Mobile' => '?0', + 'Sec-Ch-Ua-Platform' => '"Windows"', + 'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', + ], + + // --- CHROME ON MACOS --- + 'chrome_mac' => [ + 'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36', + 'Sec-Ch-Ua' => '"Google Chrome";v="141", "Chromium";v="141", "Not=A?Brand";v="99"', + 'Sec-Ch-Ua-Mobile' => '?0', + 'Sec-Ch-Ua-Platform' => '"macOS"', + 'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', + ], + + // --- EDGE ON WINDOWS --- + 'edge_windows' => [ + 'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36 Edg/142.0.0.0', + 'Sec-Ch-Ua' => '"Microsoft Edge";v="142", "Chromium";v="142", "Not=A?Brand";v="99"', + 'Sec-Ch-Ua-Mobile' => '?0', + 'Sec-Ch-Ua-Platform' => '"Windows"', + 'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', + ], + + // --- FIREFOX ON 
WINDOWS --- + 'firefox_windows' => [ + 'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:138.0) Gecko/20100101 Firefox/138.0', + 'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8', + 'Accept-Language' => 'en-US,en;q=0.5', + // Firefox does not send Sec-Ch-Ua headers by default + ], + + // --- FIREFOX ON LINUX --- + 'firefox_linux' => [ + 'User-Agent' => 'Mozilla/5.0 (X11; Linux x86_64; rv:137.0) Gecko/20100101 Firefox/137.0', + 'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8', + 'Accept-Language' => 'en-US,en;q=0.5', + ], + + // --- SAFARI ON MACOS --- + 'safari_mac' => [ + 'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.0 Safari/605.1.15', + 'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language' => 'en-US,en;q=0.9', + ], + + // --- CHROME ON ANDROID (Mobile) --- + 'chrome_android' => [ + 'User-Agent' => 'Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Mobile Safari/537.36', + 'Sec-Ch-Ua' => '"Google Chrome";v="142", "Chromium";v="142", "Not=A?Brand";v="99"', + 'Sec-Ch-Ua-Mobile' => '?1', + 'Sec-Ch-Ua-Platform' => '"Android"', + 'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', + ], + + // --- SAFARI ON IPHONE (Mobile) --- + 'safari_iphone' => [ + 'User-Agent' => 'Mozilla/5.0 (iPhone; CPU iPhone OS 18_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.0 Mobile/15E148 Safari/604.1', + 'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language' => 'en-US,en;q=0.9', + ], ]; + private const COMMON_HEADERS = [ + 'Accept' => 
'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', + 'Accept-Language' => 'en-US,en;q=0.9', + 'Sec-Fetch-Dest' => 'document', + 'Sec-Fetch-Mode' => 'navigate', + 'Sec-Fetch-Site' => 'none', + 'Sec-Fetch-User' => '?1', + 'Upgrade-Insecure-Requests' => '1', + ]; + + private const ENTRY_REFERERS = [ + 'https://www.google.com/', + 'https://www.bing.com/', + 'https://duckduckgo.com/', + 'https://t.co/', // Twitter/X shortener + 'https://www.reddit.com/', + ]; + + private ?string $lastUrl = null; + public function __construct( private readonly HttpClientInterface $client, - private readonly array $userAgents = self::USER_AGENTS, private readonly int $repeatOnFailure = 1, ) { } - public function getRandomUserAgent(): string - { - return $this->userAgents[array_rand($this->userAgents)]; - } - public function request(string $method, string $url, array $options = []): ResponseInterface { $repeatsLeft = $this->repeatOnFailure; do { - $modifiedOptions = $options; - if (!isset($modifiedOptions['headers']['User-Agent'])) { - $modifiedOptions['headers']['User-Agent'] = $this->getRandomUserAgent(); + $profile = self::PROFILES[array_rand(self::PROFILES)]; + + // Merge common headers with the specific browser profile + $headers = array_merge(self::COMMON_HEADERS, $profile); + + //Add a Referer header if not already set, to make it look more like a real browser request. We use the last URL we visited as the referer, to simulate internal navigation. If we don't have a last URL (first request), we pick a random entry point from common referers. + if (!isset($options['headers']['Referer'])) { + if ($this->lastUrl !== null) { + // If we have a previous URL, use it (Internal Navigation) + $headers['Referer'] = $this->lastUrl; + } else { + // First request? 
Pick an entry point (External Entry) + $headers['Referer'] = self::ENTRY_REFERERS[array_rand(self::ENTRY_REFERERS)]; + } } - $response = $this->client->request($method, $url, $modifiedOptions); + + // Allow manual overrides from $options + $options['headers'] = array_merge($headers, $options['headers'] ?? []); + + $response = $this->client->request($method, $url, $options); //When we get a 503, 403 or 429, we assume that the server is blocking us and try again with a different user agent if (!in_array($response->getStatusCode(), [403, 429, 503], true)) { @@ -83,6 +158,7 @@ final class RandomizeUseragentHttpClient implements HttpClientInterface } //Otherwise we try again with a different user agent, until we run out of retries + usleep(5000); // Sleep for 5ms to avoid hammering the server too hard in case of multiple retries } while ($repeatsLeft-- > 0); return $response; @@ -95,6 +171,6 @@ final class RandomizeUseragentHttpClient implements HttpClientInterface public function withOptions(array $options): static { - return new self($this->client->withOptions($options), $this->userAgents, $this->repeatOnFailure); + return new self($this->client->withOptions($options), $this->repeatOnFailure); } }