Optimizing Zyte API usage#

Getting high throughput#

A single request to Zyte API can take tens of seconds to process. The response time depends on the target website and on the task performed (the API features used). For example, if you use the browserHtml feature, it is common to get a response in 10 to 30 seconds.

This means that, if requests are sent sequentially, the throughput can be quite low: only a few responses per minute.

To speed up the processing (increase the throughput), send many requests in parallel, instead of sending them sequentially.

For example, if the average response time for your website is 15 seconds, and you want to achieve 1 RPS (1 response per second) speed, you should be sending 15 requests in parallel.

Note

Install and configure code example requirements to run the example below.

using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;

// Pages to fetch; one Zyte API request is sent for each URL.
string[] urls =
{
    "https://books.toscrape.com/catalogue/page-1.html",
    "https://books.toscrape.com/catalogue/page-2.html",
};
var output = new List<HttpResponseMessage>();

// A single shared client whose handler allows up to 15 simultaneous
// connections to the API server and decompresses responses automatically.
var client = new HttpClient(new HttpClientHandler
{
    MaxConnectionsPerServer = 15,
    AutomaticDecompression = DecompressionMethods.All,
});

// HTTP Basic authentication: the API key is the user name, the password is empty.
var apiKey = "YOUR_API_KEY";
var credentials = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
client.DefaultRequestHeaders.Add("Authorization", "Basic " + System.Convert.ToBase64String(credentials));

client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

// Start every request up front without awaiting it, so they all run in parallel.
var pending = urls
    .Select(url =>
    {
        var payload = JsonSerializer.Serialize(new Dictionary<string, object>
        {
            ["url"] = url,
            ["browserHtml"] = true,
        });
        var body = new StringContent(payload, Encoding.UTF8, "application/json");
        return client.PostAsync("https://api.zyte.com/v1/extract", body);
    })
    .ToList();

// Collect the responses in the order in which they complete.
while (pending.Count > 0)
{
    var finished = await Task.WhenAny(pending);
    pending.Remove(finished);
    output.Add(await finished);
}
input.jsonl#
{"url": "https://books.toscrape.com/catalogue/page-1.html", "browserHtml": true}
{"url": "https://books.toscrape.com/catalogue/page-2.html", "browserHtml": true}
# Send each line of input.jsonl as the JSON body of its own request, running
# up to 15 curl processes at a time (xargs -P 15), and append every response
# to output.jsonl. awk prints the whole response ($0) followed by a newline;
# printing $1 would keep only the text before the first whitespace character
# and truncate any response that contains a space.
cat input.jsonl \
| xargs -P 15 -d\\n -n 1 \
bash -c "
    curl \
        --user YOUR_API_KEY: \
        --header 'Content-Type: application/json' \
        --data \"\$0\" \
        --compressed \
        https://api.zyte.com/v1/extract \
    | awk '{print \$0}' \
    >> output.jsonl
"
input.jsonl#
{"url": "https://books.toscrape.com/catalogue/page-1.html", "browserHtml": true}
{"url": "https://books.toscrape.com/catalogue/page-2.html", "browserHtml": true}
# Run the zyte-api command-line client on the requests listed in input.jsonl.
# NOTE(review): --n-conn 15 presumably sets the number of parallel connections
# and -o the file that receives the responses; confirm with `zyte-api --help`.
zyte-api --n-conn 15 input.jsonl -o output.jsonl
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Base64;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import org.apache.hc.client5.http.async.methods.SimpleHttpRequest;
import org.apache.hc.client5.http.async.methods.SimpleHttpResponse;
import org.apache.hc.client5.http.impl.async.CloseableHttpAsyncClient;
import org.apache.hc.client5.http.impl.async.HttpAsyncClients;
import org.apache.hc.client5.http.impl.nio.PoolingAsyncClientConnectionManager;
import org.apache.hc.client5.http.impl.nio.PoolingAsyncClientConnectionManagerBuilder;
import org.apache.hc.client5.http.ssl.ClientTlsStrategyBuilder;
import org.apache.hc.core5.concurrent.FutureCallback;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.nio.ssl.TlsStrategy;
import org.apache.hc.core5.reactor.ssl.TlsDetails;

class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  public static void main(final String[] args)
      throws ExecutionException, InterruptedException, IOException, ParseException {

    String[] urls = {
      "https://books.toscrape.com/catalogue/page-1.html",
      "https://books.toscrape.com/catalogue/page-2.html"
    };
    List<Future> futures = new ArrayList<Future>();
    List<String> output = new ArrayList<String>();

    int concurrency = 15;

    // https://issues.apache.org/jira/browse/HTTPCLIENT-2219
    final TlsStrategy tlsStrategy =
        ClientTlsStrategyBuilder.create()
            .useSystemProperties()
            .setTlsDetailsFactory(
                sslEngine ->
                    new TlsDetails(sslEngine.getSession(), sslEngine.getApplicationProtocol()))
            .build();

    PoolingAsyncClientConnectionManager connectionManager =
        PoolingAsyncClientConnectionManagerBuilder.create().setTlsStrategy(tlsStrategy).build();
    connectionManager.setMaxTotal(concurrency);
    connectionManager.setDefaultMaxPerRoute(concurrency);

    CloseableHttpAsyncClient client =
        HttpAsyncClients.custom().setConnectionManager(connectionManager).build();
    try {
      client.start();
      for (int i = 0; i < urls.length; i++) {
        Map<String, Object> parameters = ImmutableMap.of("url", urls[i], "browserHtml", true);
        String requestBody = new Gson().toJson(parameters);

        SimpleHttpRequest request =
            new SimpleHttpRequest("POST", "https://api.zyte.com/v1/extract");
        request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
        request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
        request.setBody(requestBody, ContentType.APPLICATION_JSON);

        final Future<SimpleHttpResponse> future =
            client.execute(
                request,
                new FutureCallback<SimpleHttpResponse>() {
                  public void completed(final SimpleHttpResponse response) {
                    String apiResponse = response.getBodyText();
                    JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
                    String browserHtml = jsonObject.get("browserHtml").getAsString();
                    output.add(browserHtml);
                  }

                  public void failed(final Exception ex) {}

                  public void cancelled() {}
                });
        futures.add(future);
      }
      for (int i = 0; i < futures.size(); i++) {
        futures.get(i).get();
      }
    } finally {
      client.close();
    }
  }

  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    String encodedAuth = Base64.getEncoder().encodeToString(auth.getBytes());
    return "Basic " + encodedAuth;
  }
}
const { ConcurrencyManager } = require('axios-concurrency')
const axios = require('axios')

// Pages to fetch; one Zyte API request is sent for each URL.
const urls = [
  'https://books.toscrape.com/catalogue/page-1.html',
  'https://books.toscrape.com/catalogue/page-2.html'
]
const output = []

// Shared axios instance, wrapped by the concurrency manager with a limit of 15.
const client = axios.create()
ConcurrencyManager(client, 15)

// Posts a browserHtml request for a single URL and stores the response body.
async function fetchPage (url) {
  const body = { url, browserHtml: true }
  const config = {
    auth: { username: 'YOUR_API_KEY' },
    headers: { 'Accept-Encoding': 'gzip, deflate' }
  }
  const response = await client.post('https://api.zyte.com/v1/extract', body, config)
  return output.push(response.data)
}

// Issue every request immediately; responses are stored as they arrive.
Promise.all(urls.map((url) => fetchPage(url)))
<?php

// Pages to fetch; one Zyte API request is sent for each URL.
$urls = [
  'https://books.toscrape.com/catalogue/page-1.html',
  'https://books.toscrape.com/catalogue/page-2.html',
];
$output = [];

$client = new GuzzleHttp\Client();

// Stores the decoded JSON body of a completed response in $output.
$collect = function ($response) use (&$output) {
    $output[] = json_decode($response->getBody());
};

// Start one asynchronous request per URL and keep its promise.
$promises = array_map(function ($url) use ($client, $collect) {
    $request = new \GuzzleHttp\Psr7\Request('POST', 'https://api.zyte.com/v1/extract');
    $options = [
        'auth' => ['YOUR_API_KEY', ''],
        'headers' => ['Accept-Encoding' => 'gzip'],
        'json' => [
            'url' => $url,
            'browserHtml' => true,
        ],
    ];

    return $client->sendAsync($request, $options)->then($collect);
}, $urls);

// Block until every request has finished.
foreach ($promises as $pending) {
    $pending->wait();
}
import asyncio

import aiohttp

# Pages to fetch; one Zyte API request is sent for each URL.
urls = [
    'https://books.toscrape.com/catalogue/page-1.html',
    'https://books.toscrape.com/catalogue/page-2.html',
]
output = []


async def extract(client, url):
    """Post a browserHtml request for ``url`` and append the decoded JSON
    response to ``output``."""
    payload = {'url': url, 'browserHtml': True}
    credentials = aiohttp.BasicAuth('YOUR_API_KEY')
    response = await client.post(
        'https://api.zyte.com/v1/extract',
        json=payload,
        auth=credentials,
    )
    body = await response.json()
    output.append(body)


async def main():
    """Run one ``extract`` call per URL concurrently over a shared session
    whose connector allows up to 15 connections per host."""
    pool = aiohttp.TCPConnector(limit_per_host=15)
    async with aiohttp.ClientSession(connector=pool) as client:
        pending = [extract(client, url) for url in urls]
        await asyncio.gather(*pending)

asyncio.run(main())
import asyncio

from zyte_api.aio.client import AsyncClient, create_session

# Pages to fetch; one Zyte API request is built for each URL.
urls = [
    'https://books.toscrape.com/catalogue/page-1.html',
    'https://books.toscrape.com/catalogue/page-2.html',
]
output = []


async def main():
    """Send a browserHtml request for every URL through the zyte_api async
    client and append each awaited result to ``output``."""
    n_conn = 15
    client = AsyncClient(n_conn=n_conn)
    queries = []
    for url in urls:
        queries.append({'url': url, 'browserHtml': True})
    async with create_session(n_conn) as session:
        # Each item yielded by the iterator is awaited to obtain its result.
        for pending in client.request_parallel_as_completed(
            queries, session=session
        ):
            result = await pending
            output.append(result)

asyncio.run(main())
from scrapy import Request, Spider

# Pages to crawl; one request is scheduled for each URL.
urls = [
    "https://books.toscrape.com/catalogue/page-1.html",
    "https://books.toscrape.com/catalogue/page-2.html",
]


class ToScrapeSpider(Spider):
    """Spider that requests every URL with ``browserHtml`` enabled in the
    ``zyte_api_automap`` request metadata and yields the response text."""

    name = "toscrape_com"

    # Both concurrency settings are raised to 15.
    custom_settings = dict(
        CONCURRENT_REQUESTS=15,
        CONCURRENT_REQUESTS_PER_DOMAIN=15,
    )

    def start_requests(self):
        """Yield one request per entry in ``urls``."""
        for page_url in urls:
            automap = {"browserHtml": True}
            yield Request(page_url, meta={"zyte_api_automap": automap})

    def parse(self, response):
        """Yield a single item holding the response URL and its text."""
        item = {"url": response.url, "browserHtml": response.text}
        yield item

As you increase concurrency, you will eventually face rate limiting. It is good practice to configure your client so that it does not get too many 429 errors, by choosing proper concurrency options: reduce the number of connections if you’re getting a lot of 429 errors, or slow down your client in some other way. However, getting a small percentage of 429 errors is normal and expected if you want to get close to the limits of your API key.