Zyte API data extraction#

Zyte API can extract data from any given URL.

Use data extraction#

To use Zyte API data extraction, you can use any of the following:

This usage documentation includes code examples for Zyte API client software and many other technologies. Code examples covers the requirements that you must meet before you can successfully run code from any code example.

Choose outputs#

Zyte API can extract the following outputs from a URL:

You can combine these output data fields in a single extract request, with the following restrictions:

In addition to output-specific options, in every Zyte API extract request you can customize the country of origin and set request metadata.

Use browser features#

An extract request can extract browser HTML, a screenshot, or both.

Extract browser HTML#

Zyte API can render a URL in a web browser and return an HTML representation of its Document Object Model (DOM).

Extracting browser HTML allows you to:

  • Extract dynamically-loaded webpage content without spending time recreating what the browser does through JavaScript and additional requests.

  • Emulate user interaction through browser actions.

To extract browser HTML, set the browserHtml key in your API request body to true.

The browserHtml key of the response JSON object is the browser HTML as a string.

using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;

HttpClientHandler handler = new HttpClientHandler()
{
    AutomaticDecompression = DecompressionMethods.All
};
HttpClient client = new HttpClient(handler);

var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);

client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

var input = new Dictionary<string, object>(){
    {"url", "https://toscrape.com"},
    {"browserHtml", true}
};
var inputJson = JsonSerializer.Serialize(input);
var content = new StringContent(inputJson, Encoding.UTF8, "application/json");

HttpResponseMessage response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
var body = await response.Content.ReadAsByteArrayAsync();

var data = JsonDocument.Parse(body);
var browserHtml = data.RootElement.GetProperty("browserHtml").ToString();
input.json#
{"url": "https://toscrape.com", "browserHtml": true}
curl \
    --user YOUR_API_KEY: \
    --header 'Content-Type: application/json' \
    --data @input.json \
    --compressed \
    https://api.zyte.com/v1/extract \
| jq --raw-output .browserHtml
input.jsonl#
{"url": "https://toscrape.com", "browserHtml": true}
zyte-api input.jsonl 2> /dev/null \
| jq --raw-output .browserHtml
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;

class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> parameters =
        ImmutableMap.of("url", "https://toscrape.com", "browserHtml", true);
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    request.setEntity(new StringEntity(requestBody));

    try (CloseableHttpClient client = HttpClients.createDefault()) {
      try (CloseableHttpResponse response = client.execute(request)) {
        HttpEntity entity = response.getEntity();
        String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
        JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
        String browserHtml = jsonObject.get("browserHtml").getAsString();
      }
    }
  }

  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    String encodedAuth = Base64.getEncoder().encodeToString(auth.getBytes());
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')

axios.post(
    'https://api.zyte.com/v1/extract',
    {
      url: 'https://toscrape.com',
      browserHtml: true
    },
    {
      auth: { username: 'YOUR_API_KEY' },
      headers: { 'Accept-Encoding': 'gzip, deflate' }
    }
  ).then((response) => {
    const browserHtml = response.data.browserHtml
  })
<?php

$client = new GuzzleHttp\Client();
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', [
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => [
        'url' => 'https://toscrape.com',
        'browserHtml' => true,
    ],
]);
$api = json_decode($response->getBody());
$browser_html = $api->browserHtml;
import requests

api_response = requests.post(
    'https://api.zyte.com/v1/extract',
    auth=('YOUR_API_KEY', ''),
    json={
       'url': 'https://toscrape.com',
        'browserHtml': True,
    },
)
browser_html: str = api_response.json()['browserHtml']
import asyncio

from zyte_api.aio.client import AsyncClient

async def main():
    client = AsyncClient()
    api_response = await client.request_raw(
        {
            'url': 'https://toscrape.com',
            'browserHtml': True,
        }
    )
    browser_html: str = api_response['browserHtml']

asyncio.run(main())
from scrapy import Request, Spider


class ToScrapeSpider(Spider):
    name = "toscrape_com"

    def start_requests(self):
        yield Request(
            "https://toscrape.com",
            meta={
                "zyte_api_automap": {
                    "browserHtml": True,
                },
            },
        )

    def parse(self, response):
        browser_html: str = response.text

If you need non-HTML content, non-GET requests, or request headers beyond those supported for browser-based extraction, extract a response body instead.

Extract a screenshot#

You can set the screenshot key in your API request body to true to extract a screenshot.

The screenshot key of the response JSON object is the Base64-encoded screenshot.

using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;

HttpClientHandler handler = new HttpClientHandler()
{
    AutomaticDecompression = DecompressionMethods.All
};
HttpClient client = new HttpClient(handler);

var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);

client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

var input = new Dictionary<string, object>(){
    {"url", "https://toscrape.com"},
    {"screenshot", true}
};
var inputJson = JsonSerializer.Serialize(input);
var content = new StringContent(inputJson, Encoding.UTF8, "application/json");

HttpResponseMessage response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
var body = await response.Content.ReadAsByteArrayAsync();

var data = JsonDocument.Parse(body);
var base64Screenshot = data.RootElement.GetProperty("screenshot").ToString();
var screenshot = System.Convert.FromBase64String(base64Screenshot);
input.json#
{"url": "https://toscrape.com", "screenshot": true}
curl \
    --user YOUR_API_KEY: \
    --header 'Content-Type: application/json' \
    --data @input.json \
    --compressed \
    https://api.zyte.com/v1/extract \
| jq --raw-output .screenshot \
| base64 --decode \
> screenshot.jpg
input.jsonl#
{"url": "https://toscrape.com", "screenshot": true}
zyte-api input.jsonl 2> /dev/null \
| jq --raw-output .screenshot \
| base64 --decode \
> screenshot.jpg
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;

class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> parameters =
        ImmutableMap.of("url", "https://toscrape.com", "screenshot", true);
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    request.setEntity(new StringEntity(requestBody));

    try (CloseableHttpClient client = HttpClients.createDefault()) {
      try (CloseableHttpResponse response = client.execute(request)) {
        HttpEntity entity = response.getEntity();
        String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
        JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
        String base64Screenshot = jsonObject.get("screenshot").getAsString();
        byte[] screenshot = Base64.getDecoder().decode(base64Screenshot);
      }
    }
  }

  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    String encodedAuth = Base64.getEncoder().encodeToString(auth.getBytes());
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')

axios.post(
    'https://api.zyte.com/v1/extract',
    {
      url: 'https://toscrape.com',
      screenshot: true
    },
    {
      auth: { username: 'YOUR_API_KEY' },
      headers: { 'Accept-Encoding': 'gzip, deflate' }
    }
  ).then((response) => {
    const screenshot = Buffer.from(response.data.screenshot, 'base64')
  })
<?php

$client = new GuzzleHttp\Client();
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', [
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => [
        'url' => 'https://toscrape.com',
        'screenshot' => true,
    ],
]);
$api = json_decode($response->getBody());
$screenshot = base64_decode($api->screenshot);
from base64 import b64decode

import requests

api_response = requests.post(
    'https://api.zyte.com/v1/extract',
    auth=('YOUR_API_KEY', ''),
    json={
        'url': 'https://toscrape.com',
        'screenshot': True,
    },
)
screenshot: bytes = b64decode(api_response.json()['screenshot'])
import asyncio
from base64 import b64decode

from zyte_api.aio.client import AsyncClient

async def main():
    client = AsyncClient()
    api_response = await client.request_raw(
        {
            'url': 'https://toscrape.com',
            'screenshot': True,
        }
    )
    screenshot: bytes = b64decode(api_response['screenshot'])

asyncio.run(main())
from base64 import b64decode

from scrapy import Request, Spider


class ToScrapeComSpider(Spider):
    name = "toscrape_com"

    def start_requests(self):
        yield Request(
            "https://toscrape.com",
            meta={
                "zyte_api_automap": {
                    "screenshot": True,
                },
            },
        )

    def parse(self, response):
        screenshot: bytes = b64decode(response.raw_api_response["screenshot"])

Use browser actions to modify a webpage through simulated user interaction before the screenshot is taken.

You may also define an screenshotOptions key in your API request body to configure the format and scope of the screenshot. For more information, look up screenshotOptions in Zyte API HTTP API.

Set browser actions#

When extracting browser HTML or a screenshot, you can use the actions key in your API request body to define a sequence of actions to perform during browser rendering, and hence modify the DOM before the requested output is generated for you.

using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
using HtmlAgilityPack;

HttpClientHandler handler = new HttpClientHandler()
{
    AutomaticDecompression = DecompressionMethods.All
};
HttpClient client = new HttpClient(handler);

var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);

client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

var input = new Dictionary<string, object>(){
    {"url", "https://quotes.toscrape.com/scroll"},
    {"browserHtml", true},
    {
        "actions",
        new List<Dictionary<string, object>>()
        {
            new Dictionary<string, object>()
            {
                {"action", "scrollBottom"}
            }
        }
    }
};
var inputJson = JsonSerializer.Serialize(input);
var content = new StringContent(inputJson, Encoding.UTF8, "application/json");

HttpResponseMessage response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
var body = await response.Content.ReadAsByteArrayAsync();

var data = JsonDocument.Parse(body);
var browserHtml = data.RootElement.GetProperty("browserHtml").ToString();
var htmlDocument = new HtmlDocument();
htmlDocument.LoadHtml(browserHtml);
var navigator = htmlDocument.CreateNavigator();
var quoteCount = (double)navigator.Evaluate("count(//*[@class=\"quote\"])");
input.json#
{
    "url": "https://quotes.toscrape.com/scroll",
    "browserHtml": true,
    "actions": [
        {
            "action": "scrollBottom"
        }
    ]
}
curl \
    --user YOUR_API_KEY: \
    --header 'Content-Type: application/json' \
    --data @input.json \
    --compressed \
    https://api.zyte.com/v1/extract \
| jq --raw-output .browserHtml \
| xmllint --html --xpath 'count(//*[@class="quote"])' - 2> /dev/null
input.jsonl#
{"url":"https://quotes.toscrape.com/scroll","browserHtml":true,"actions":[{"action":"scrollBottom"}]}
zyte-api input.jsonl 2> /dev/null \
| jq --raw-output .browserHtml \
| xmllint --html --xpath 'count(//*[@class="quote"])' - 2> /dev/null
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Collections;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

class Example {

  private static final String API_KEY = "YOUR_API_KEY";

  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> action = ImmutableMap.of("action", "scrollBottom");
    Map<String, Object> parameters =
        ImmutableMap.of(
            "url",
            "https://quotes.toscrape.com/scroll",
            "browserHtml",
            true,
            "actions",
            Collections.singletonList(action));
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    request.setEntity(new StringEntity(requestBody));

    try (CloseableHttpClient client = HttpClients.createDefault()) {
      try (CloseableHttpResponse response = client.execute(request)) {
        HttpEntity entity = response.getEntity();
        String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
        JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
        String browserHtml = jsonObject.get("browserHtml").getAsString();
        Document document = Jsoup.parse(browserHtml);
        int quoteCount = document.select(".quote").size();
      }
    }
  }

  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    String encodedAuth = Base64.getEncoder().encodeToString(auth.getBytes());
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')
const cheerio = require('cheerio')

axios.post(
    'https://api.zyte.com/v1/extract',
    {
      url: 'https://quotes.toscrape.com/scroll',
      browserHtml: true,
      actions: [
        {
          action: 'scrollBottom'
        }
      ]
    },
    {
      auth: { username: 'YOUR_API_KEY' },
      headers: { 'Accept-Encoding': 'gzip, deflate' }
    }
  ).then((response) => {
    const browserHtml = response.data.browserHtml
    const $ = cheerio.load(browserHtml)
    const quoteCount = $('.quote').length
  })
<?php

$client = new GuzzleHttp\Client();
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', [
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => [
        'url' => 'https://quotes.toscrape.com/scroll',
        'browserHtml' => true,
        'actions' => [
            ['action' => 'scrollBottom'],
        ],
    ],
]);
$data = json_decode($response->getBody());
$doc = new DOMDocument();
$doc->loadHTML($data->browserHtml);
$xpath = new DOMXPath($doc);
$quote_count = $xpath->query("//*[@class='quote']")->count();
import json

import requests
from parsel import Selector

api_response = requests.post(
    'https://api.zyte.com/v1/extract',
    auth=('YOUR_API_KEY', ''),
    json={
        'url': 'https://quotes.toscrape.com/scroll',
        'browserHtml': True,
        'actions': [
            {
                'action': 'scrollBottom',
            },
        ],
    },
)
browser_html = api_response.json()['browserHtml']
quote_count = len(Selector(browser_html).css('.quote'))
import asyncio
import json

from parsel import Selector
from zyte_api.aio.client import AsyncClient

async def main():
    client = AsyncClient()
    api_response = await client.request_raw(
        {
            'url': 'https://quotes.toscrape.com/scroll',
            'browserHtml': True,
            'actions': [
                {
                    'action': 'scrollBottom',
                },
            ],
        },
    )
    browser_html = api_response['browserHtml']
    quote_count = len(Selector(browser_html).css('.quote'))

asyncio.run(main())
from scrapy import Request, Spider


class QuotesToScrapeComSpider(Spider):
    name = "quotes_toscrape_com"

    def start_requests(self):
        yield Request(
            "https://quotes.toscrape.com/scroll",
            meta={
                "zyte_api_automap": {
                    "browserHtml": True,
                    "actions": [
                        {
                            "action": "scrollBottom",
                        },
                    ],
                },
            },
        )

    def parse(self, response):
        quote_count = len(response.css(".quote"))

Zyte API supports 3 types of actions:

  • Generic actions work on every website. They allow you to type text into input fields, perform cursor actions (click, hover, scroll), and wait for certain events or for a given time.

  • Special actions expose functionality that requires specific knowledge of the target website, such as using their search box or filling a form.

    They are only available for certain websites. To find out if an action is available for a given website, send a test request using that action. If the action is not supported, you will get an error API response indicating so.

  • Custom actions, available for Enterprise plans, invoke a custom script.

You are free to use as many actions as you wish, but total browser execution time is limited to 60 seconds. If your actions are still running by that time, the on-going action is interrupted, follow-up actions are not executed at all, and you get your requested output (browser HTML, screenshot) as it was rendered at that time.

The Zyte API response includes an action key that provides details about action execution, including elapsedTime, error, and status fields to help you debug your actions, e.g. to find out which actions were executed successfully and which actions were not.

Look up actions in the specification for the complete actions API.

Set request headers#

When extracting browser HTML or a screenshot, you can set the requestHeaders key in your API request body to an object where keys are camelCase header names and values are header values, representing headers to include in your request.

using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
using System.Xml.XPath;
using HtmlAgilityPack;

HttpClientHandler handler = new HttpClientHandler()
{
    AutomaticDecompression = DecompressionMethods.All
};
HttpClient client = new HttpClient(handler);

var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);

client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

var input = new Dictionary<string, object>(){
    {"url", "https://httpbin.org/anything"},
    {"browserHtml", true},
    {
        "requestHeaders",
        new Dictionary<string, object>()
        {
            {"referer", "https://example.org/"}
        }
    }
};
var inputJson = JsonSerializer.Serialize(input);
var content = new StringContent(inputJson, Encoding.UTF8, "application/json");

HttpResponseMessage response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
var body = await response.Content.ReadAsByteArrayAsync();

var data = JsonDocument.Parse(body);
var browserHtml = data.RootElement.GetProperty("browserHtml").ToString();
var htmlDocument = new HtmlDocument();
htmlDocument.LoadHtml(browserHtml);
var navigator = htmlDocument.CreateNavigator();
var nodeIterator = (XPathNodeIterator)navigator.Evaluate("//text()");
nodeIterator.MoveNext();
var responseJson = nodeIterator.Current.ToString();
var responseData = JsonDocument.Parse(responseJson);
var headerEnumerator = responseData.RootElement.GetProperty("headers").EnumerateObject();
var headers = new Dictionary<string, string>();
while (headerEnumerator.MoveNext())
{
    headers.Add(
        headerEnumerator.Current.Name.ToString(),
        headerEnumerator.Current.Value.ToString()
    );
}
input.json#
{
    "url": "https://httpbin.org/anything",
    "browserHtml": true,
    "requestHeaders": {
        "referer": "https://example.org/"
    }
}
curl \
    --user YOUR_API_KEY: \
    --header 'Content-Type: application/json' \
    --data @input.json \
    --compressed \
    https://api.zyte.com/v1/extract \
| jq --raw-output .browserHtml \
| xmllint --html --xpath '//text()' - 2> /dev/null \
| jq .headers
input.jsonl#
{"url": "https://httpbin.org/anything", "browserHtml": true, "requestHeaders": {"referer": "https://example.org/"}}
zyte-api input.jsonl 2> /dev/null \
| jq --raw-output .browserHtml \
| xmllint --html --xpath '//text()' - 2> /dev/null \
| jq .headers
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> requestHeaders = ImmutableMap.of("referer", "https://example.org/");
    Map<String, Object> parameters =
        ImmutableMap.of(
            "url",
            "https://httpbin.org/anything",
            "browserHtml",
            true,
            "requestHeaders",
            requestHeaders);
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    request.setEntity(new StringEntity(requestBody));

    try (CloseableHttpClient client = HttpClients.createDefault()) {
      try (CloseableHttpResponse response = client.execute(request)) {
        HttpEntity entity = response.getEntity();
        String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
        JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
        String browserHtml = jsonObject.get("browserHtml").getAsString();
        Document document = Jsoup.parse(browserHtml);
        JsonObject data = JsonParser.parseString(document.text()).getAsJsonObject();
        JsonObject headers = data.get("headers").getAsJsonObject();
      }
    }
  }

  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    String encodedAuth = Base64.getEncoder().encodeToString(auth.getBytes());
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')
const cheerio = require('cheerio')

axios.post(
    'https://api.zyte.com/v1/extract',
    {
      url: 'https://httpbin.org/anything',
      browserHtml: true,
      requestHeaders: {
        referer: 'https://example.org/'
      }
    },
    {
      auth: { username: 'YOUR_API_KEY' },
      headers: { 'Accept-Encoding': 'gzip, deflate' }
    }
  ).then((response) => {
    const $ = cheerio.load(response.data.browserHtml)
    const data = JSON.parse($.text())
    const headers = data.headers
  })
<?php

$client = new GuzzleHttp\Client();
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', [
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => [
        'url' => 'https://httpbin.org/anything',
        'browserHtml' => true,
        'requestHeaders' => [
            'referer' => 'https://example.org/',
        ],
    ],
]);
$api = json_decode($response->getBody());
$doc = new DOMDocument();
$doc->loadHTML($api->browserHtml);
$data = json_decode($doc->textContent);
$headers = $data->headers;
import json

import requests
from parsel import Selector

api_response = requests.post(
    'https://api.zyte.com/v1/extract',
    auth=('YOUR_API_KEY', ''),
    json={
        'url': 'https://httpbin.org/anything',
        'browserHtml': True,
        'requestHeaders': {
            'referer': 'https://example.org/',
        },
    },
)
browser_html = api_response.json()['browserHtml']
selector = Selector(browser_html)
response_json = selector.xpath('//text()').get()
response_data = json.loads(response_json)
headers = response_data['headers']
import asyncio
import json

from parsel import Selector
from zyte_api.aio.client import AsyncClient

async def main():
    client = AsyncClient()
    api_response = await client.request_raw(
        {
            'url': 'https://httpbin.org/anything',
            'browserHtml': True,
            'requestHeaders': {
                'referer': 'https://example.org/',
            },
        }
    )
    browser_html = api_response['browserHtml']
    selector = Selector(browser_html)
    response_json = selector.xpath('//text()').get()
    response_data = json.loads(response_json)
    headers = response_data['headers']

asyncio.run(main())
import json

from scrapy import Request, Spider


class HTTPBinOrgSpider(Spider):
    name = "httpbin_org"

    def start_requests(self):
        yield Request(
            "https://httpbin.org/anything",
            headers={"Referer": "https://example.org/"},
            meta={
                "zyte_api_automap": {
                    "browserHtml": True,
                },
            },
        )

    def parse(self, response):
        response_json = response.xpath("//text()").get()
        response_data = json.loads(response_json)
        headers = response_data["headers"]

At the moment, only the Referer header can be overridden this way. If you need to override additional headers, extract a response body instead, using its request header definition property (customHttpRequestHeaders).

Enable or disable JavaScript#

When extracting browser HTML, JavaScript execution is enabled by default for most websites.

For some websites, however, JavaScript execution is disabled by default because it helps data extraction.

You can set the javascript key in your API request body to true or false to force enabling or disabling JavaScript execution, regardless of the default value for the target website.

using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
using System.Xml.XPath;
using HtmlAgilityPack;

HttpClientHandler handler = new HttpClientHandler()
{
    AutomaticDecompression = DecompressionMethods.All
};
HttpClient client = new HttpClient(handler);

var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);

client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

var input = new Dictionary<string, object>(){
    {"url", "https://www.whatismybrowser.com/detect/is-javascript-enabled"},
    {"browserHtml", true},
    {"javascript", false}
};
var inputJson = JsonSerializer.Serialize(input);
var content = new StringContent(inputJson, Encoding.UTF8, "application/json");

HttpResponseMessage response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
var body = await response.Content.ReadAsByteArrayAsync();

var data = JsonDocument.Parse(body);
var browserHtml = data.RootElement.GetProperty("browserHtml").ToString();
var htmlDocument = new HtmlDocument();
htmlDocument.LoadHtml(browserHtml);
var navigator = htmlDocument.CreateNavigator();
var nodeIterator = (XPathNodeIterator)navigator.Evaluate("//*[@id=\"detected_value\"]/text()");
nodeIterator.MoveNext();
var isJavaScriptEnabled = nodeIterator.Current.ToString();
input.json#
{
    "url": "https://www.whatismybrowser.com/detect/is-javascript-enabled",
    "browserHtml": true,
    "javascript": false
}
curl \
    --user YOUR_API_KEY: \
    --header 'Content-Type: application/json' \
    --data @input.json \
    --compressed \
    https://api.zyte.com/v1/extract \
| jq --raw-output .browserHtml \
| xmllint --html --xpath '//*[@id="detected_value"]/text()' - 2> /dev/null
input.jsonl#
{"url": "https://www.whatismybrowser.com/detect/is-javascript-enabled", "browserHtml": true, "javascript": false}
zyte-api input.jsonl 2> /dev/null \
| jq --raw-output .browserHtml \
| xmllint --html --xpath '//*[@id="detected_value"]/text()' - 2> /dev/null
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> parameters =
        ImmutableMap.of(
            "url",
            "https://www.whatismybrowser.com/detect/is-javascript-enabled",
            "browserHtml",
            true,
            "javascript",
            false);
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    request.setEntity(new StringEntity(requestBody));

    try (CloseableHttpClient client = HttpClients.createDefault()) {
      try (CloseableHttpResponse response = client.execute(request)) {
        HttpEntity entity = response.getEntity();
        String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
        JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
        String browserHtml = jsonObject.get("browserHtml").getAsString();
        Document document = Jsoup.parse(browserHtml);
        String isJavaScriptEnabled = document.select("#detected_value").text();
      }
    }
  }

  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    String encodedAuth = Base64.getEncoder().encodeToString(auth.getBytes());
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')
const cheerio = require('cheerio')

axios.post(
    'https://api.zyte.com/v1/extract',
    {
      url: 'https://www.whatismybrowser.com/detect/is-javascript-enabled',
      browserHtml: true,
      javascript: false
    },
    {
      auth: { username: 'YOUR_API_KEY' },
      headers: { 'Accept-Encoding': 'gzip, deflate' }
    }
  ).then((response) => {
    const $ = cheerio.load(response.data.browserHtml)
    const isJavaScriptEnabled = $('#detected_value').text()
  })
<?php

$client = new GuzzleHttp\Client();
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', [
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => [
        'url' => 'https://www.whatismybrowser.com/detect/is-javascript-enabled',
        'browserHtml' => true,
        'javascript' => false,
    ],
]);
$api = json_decode($response->getBody());
$doc = new DOMDocument();
$doc->loadHTML($api->browserHtml);
$xpath = new DOMXPath($doc);
$is_javascript_enabled = $xpath->query("//*[@id='detected_value']")->item(0)->textContent;
import requests
from parsel import Selector

api_response = requests.post(
    'https://api.zyte.com/v1/extract',
    auth=('YOUR_API_KEY', ''),
    json={
        'url': 'https://www.whatismybrowser.com/detect/is-javascript-enabled',
        'browserHtml': True,
        'javascript': False,
    },
)
browser_html = api_response.json()['browserHtml']
selector = Selector(browser_html)
is_javascript_enabled: str = selector.css('#detected_value::text').get()
import asyncio

from parsel import Selector
from zyte_api.aio.client import AsyncClient

async def main():
    client = AsyncClient()
    api_response = await client.request_raw(
        {
            'url': 'https://www.whatismybrowser.com/detect/is-javascript-enabled',
            'browserHtml': True,
            'javascript': False,
        }
    )
    browser_html = api_response['browserHtml']
    selector = Selector(browser_html)
    is_javascript_enabled: str = selector.css('#detected_value::text').get()

asyncio.run(main())
from scrapy import Request, Spider


class HTTPBinOrgSpider(Spider):
    name = "httpbin_org"

    def start_requests(self):
        yield Request(
            "https://www.whatismybrowser.com/detect/is-javascript-enabled",
            meta={
                "zyte_api_automap": {
                    "browserHtml": True,
                    "javascript": False,
                },
            },
        )

    def parse(self, response):
        is_javascript_enabled: str = response.css("#detected_value::text").get()

Extract a response body#

Extracting a response body allows you to:

  • Get faster response times.

  • Reduce costs.

  • Download non-HTML content (e.g. JSON, XML), including binary content (e.g. images).

  • Set a request method, body, and arbitrary headers.

To extract a response body, set the httpResponseBody key in your API request body to true.

The httpResponseBody key of the response JSON object is the Base64-encoded response body.

using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;

HttpClientHandler handler = new HttpClientHandler()
{
    AutomaticDecompression = DecompressionMethods.All
};
HttpClient client = new HttpClient(handler);

var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);

client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

var input = new Dictionary<string, object>(){
    {"url", "https://toscrape.com"},
    {"httpResponseBody", true}
};
var inputJson = JsonSerializer.Serialize(input);
var content = new StringContent(inputJson, Encoding.UTF8, "application/json");

HttpResponseMessage response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
var body = await response.Content.ReadAsByteArrayAsync();

var data = JsonDocument.Parse(body);
var base64HttpResponseBody = data.RootElement.GetProperty("httpResponseBody").ToString();
var httpResponseBody = System.Convert.FromBase64String(base64HttpResponseBody);
input.json#
{"url": "https://toscrape.com", "httpResponseBody": true}
curl \
    --user YOUR_API_KEY: \
    --header 'Content-Type: application/json' \
    --data @input.json \
    --compressed \
    https://api.zyte.com/v1/extract \
| jq --raw-output .httpResponseBody \
| base64 --decode \
> output.html
input.jsonl#
{"url": "https://toscrape.com", "httpResponseBody": true}
zyte-api input.jsonl 2> /dev/null \
| jq --raw-output .httpResponseBody \
| base64 --decode \
> output.html
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;

class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> parameters =
        ImmutableMap.of("url", "https://toscrape.com", "httpResponseBody", true);
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    request.setEntity(new StringEntity(requestBody));

    try (CloseableHttpClient client = HttpClients.createDefault()) {
      try (CloseableHttpResponse response = client.execute(request)) {
        HttpEntity entity = response.getEntity();
        String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
        JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
        String base64HttpResponseBody = jsonObject.get("httpResponseBody").getAsString();
        byte[] httpResponseBodyBytes = Base64.getDecoder().decode(base64HttpResponseBody);
        String httpResponseBody = new String(httpResponseBodyBytes, StandardCharsets.UTF_8);
      }
    }
  }

  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    String encodedAuth = Base64.getEncoder().encodeToString(auth.getBytes());
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')

axios.post(
    'https://api.zyte.com/v1/extract',
    {
      url: 'https://toscrape.com',
      httpResponseBody: true
    },
    {
      auth: { username: 'YOUR_API_KEY' },
      headers: { 'Accept-Encoding': 'gzip, deflate' }
    }
  ).then((response) => {
    const httpResponseBody = Buffer.from(
      response.data.httpResponseBody,
      'base64'
    )
  })
<?php

$client = new GuzzleHttp\Client();
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', [
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => [
        'url' => 'https://toscrape.com',
        'httpResponseBody' => true,
    ],
]);
$data = json_decode($response->getBody());
$http_response_body = base64_decode($data->httpResponseBody);
from base64 import b64decode

import requests

api_response = requests.post(
    'https://api.zyte.com/v1/extract',
    auth=('YOUR_API_KEY', ''),
    json={
        'url': 'https://toscrape.com',
        'httpResponseBody': True,
    },
)
http_response_body: bytes = b64decode(
    api_response.json()['httpResponseBody']
)
import asyncio
from base64 import b64decode

from zyte_api.aio.client import AsyncClient

async def main():
    client = AsyncClient()
    api_response = await client.request_raw(
        {
            'url': 'https://toscrape.com',
            'httpResponseBody': True,
        }
    )
    http_response_body: bytes = b64decode(
        api_response['httpResponseBody']
    )

asyncio.run(main())

In transparent mode, when you target a text resource (e.g. HTML, JSON), regular Scrapy requests work out of the box:

from scrapy import Spider


class ToScrapeSpider(Spider):
    name = "toscrape_com"
    start_urls = ["https://toscrape.com"]

    def parse(self, response):
        http_response_text: str = response.text

While regular Scrapy requests also work for binary responses at the moment, they may stop working in future versions of scrapy-zyte-api, so passing httpResponseBody is recommended when targeting binary resources:

from scrapy import Request, Spider


class ToScrapeSpider(Spider):
    name = "toscrape_com"

    def start_requests(self):
        yield Request(
            "https://toscrape.com",
            meta={
                "zyte_api_automap": {
                    "httpResponseBody": True,
                },
            },
        )

    def parse(self, response):
        http_response_body: bytes = response.body

If your response body is HTML, see Decode HTML.

Set a request method#

Response body extraction uses a GET request by default.

Use the httpRequestMethod key in your API request body to switch the request method to a different value: POST, PUT, DELETE, OPTIONS, TRACE, PATCH.

using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;

HttpClientHandler handler = new HttpClientHandler()
{
    AutomaticDecompression = DecompressionMethods.All
};
HttpClient client = new HttpClient(handler);

var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);

client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

var input = new Dictionary<string, object>(){
    {"url", "https://httpbin.org/anything"},
    {"httpResponseBody", true},
    {"httpRequestMethod", "POST"}
};
var inputJson = JsonSerializer.Serialize(input);
var content = new StringContent(inputJson, Encoding.UTF8, "application/json");

HttpResponseMessage response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
var body = await response.Content.ReadAsByteArrayAsync();

var data = JsonDocument.Parse(body);
var base64HttpResponseBody = data.RootElement.GetProperty("httpResponseBody").ToString();
var httpResponseBody = System.Convert.FromBase64String(base64HttpResponseBody);

var responseData = JsonDocument.Parse(httpResponseBody);
var method = responseData.RootElement.GetProperty("method").ToString();
input.json#
{"url": "https://httpbin.org/anything", "httpResponseBody": true, "httpRequestMethod": "POST"}
curl \
    --user YOUR_API_KEY: \
    --header 'Content-Type: application/json' \
    --data @input.json \
    --compressed \
    https://api.zyte.com/v1/extract \
| jq --raw-output .httpResponseBody \
| base64 --decode \
| jq .method
input.jsonl#
{"url": "https://httpbin.org/anything", "httpResponseBody": true, "httpRequestMethod": "POST"}
zyte-api input.jsonl 2> /dev/null \
| jq --raw-output .httpResponseBody \
| base64 --decode \
| jq .method
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;

class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> parameters =
        ImmutableMap.of(
            "url",
            "https://httpbin.org/anything",
            "httpResponseBody",
            true,
            "httpRequestMethod",
            "POST");
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    request.setEntity(new StringEntity(requestBody));

    try (CloseableHttpClient client = HttpClients.createDefault()) {
      try (CloseableHttpResponse response = client.execute(request)) {
        HttpEntity entity = response.getEntity();
        String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
        JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
        String base64HttpResponseBody = jsonObject.get("httpResponseBody").getAsString();
        byte[] httpResponseBodyBytes = Base64.getDecoder().decode(base64HttpResponseBody);
        String httpResponseBody = new String(httpResponseBodyBytes, StandardCharsets.UTF_8);
        JsonObject data = JsonParser.parseString(httpResponseBody).getAsJsonObject();
        String method = data.get("method").getAsString();
      }
    }
  }

  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    String encodedAuth = Base64.getEncoder().encodeToString(auth.getBytes());
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')

axios.post(
    'https://api.zyte.com/v1/extract',
    {
      url: 'https://httpbin.org/anything',
      httpResponseBody: true,
      httpRequestMethod: 'POST'
    },
    {
      auth: { username: 'YOUR_API_KEY' },
      headers: { 'Accept-Encoding': 'gzip, deflate' }
    }
  ).then((response) => {
    const httpResponseBody = Buffer.from(
      response.data.httpResponseBody,
      'base64'
    )
    const method = JSON.parse(httpResponseBody).method
  })
<?php

$client = new GuzzleHttp\Client();
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', [
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => [
        'url' => 'https://httpbin.org/anything',
        'httpResponseBody' => true,
        'httpRequestMethod' => 'POST',
    ],
]);
$data = json_decode($response->getBody());
$http_response_body = base64_decode($data->httpResponseBody);
$method = json_decode($http_response_body)->method;
import json
from base64 import b64decode

import requests

api_response = requests.post(
    'https://api.zyte.com/v1/extract',
    auth=('YOUR_API_KEY', ''),
    json={
        'url': 'https://httpbin.org/anything',
        'httpResponseBody': True,
        'httpRequestMethod': 'POST',
    },
)
http_response_body = b64decode(
    api_response.json()['httpResponseBody']
)
method = json.loads(http_response_body)['method']
import asyncio
import json
from base64 import b64decode

from zyte_api.aio.client import AsyncClient

async def main():
    client = AsyncClient()
    api_response = await client.request_raw(
        {
            'url': 'https://httpbin.org/anything',
            'httpResponseBody': True,
            'httpRequestMethod': 'POST',
        }
    )
    http_response_body: bytes = b64decode(
        api_response['httpResponseBody']
    )
    method = json.loads(http_response_body)['method']

asyncio.run(main())
import json

from scrapy import Request, Spider


class HTTPBinOrgSpider(Spider):
    name = "httpbin_org"

    def start_requests(self):
        yield Request(
            "https://httpbin.org/anything",
            method="POST",
        )

    def parse(self, response):
        method = json.loads(response.text)["method"]

The HEAD and CONNECT request methods are not supported.

Set a request body#

If you use a different request method, you may also need to set a request body.

Use the httpRequestBody key in the body of your API request to set a body for the extraction request. The value must be a Base64-encoded representation of the body bytes.

using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;

HttpClientHandler handler = new HttpClientHandler()
{
    AutomaticDecompression = DecompressionMethods.All
};
HttpClient client = new HttpClient(handler);

var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);

client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

var requestBodyBytes = Encoding.GetEncoding("ISO-8859-1").GetBytes("foo");
var base64RequestBody = System.Convert.ToBase64String(requestBodyBytes);
var input = new Dictionary<string, object>(){
    {"url", "https://httpbin.org/anything"},
    {"httpResponseBody", true},
    {"httpRequestMethod", "POST"},
    {"httpRequestBody", base64RequestBody}
};
var inputJson = JsonSerializer.Serialize(input);
var content = new StringContent(inputJson, Encoding.UTF8, "application/json");

HttpResponseMessage response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
var body = await response.Content.ReadAsByteArrayAsync();

var data = JsonDocument.Parse(body);
var base64HttpResponseBody = data.RootElement.GetProperty("httpResponseBody").ToString();
var httpResponseBody = System.Convert.FromBase64String(base64HttpResponseBody);

var responseData = JsonDocument.Parse(httpResponseBody);
var requestBody = responseData.RootElement.GetProperty("data").ToString();
input.json#
{
    "url": "https://httpbin.org/anything",
    "httpResponseBody": true,
    "httpRequestMethod": "POST",
    "httpRequestBody": "Zm9vCg=="
}
curl \
    --user YOUR_API_KEY: \
    --header 'Content-Type: application/json' \
    --data @input.json \
    --compressed \
    https://api.zyte.com/v1/extract \
| jq --raw-output .httpResponseBody \
| base64 --decode \
| jq --raw-output .data
input.jsonl#
{"url": "https://httpbin.org/anything", "httpResponseBody": true, "httpRequestMethod": "POST", "httpRequestBody": "Zm9vCg=="}
zyte-api input.jsonl 2> /dev/null \
| jq --raw-output .httpResponseBody \
| base64 --decode \
| jq --raw-output .data
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;

class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    String base64Foo = Base64.getEncoder().encodeToString("foo".getBytes());
    Map<String, Object> parameters =
        ImmutableMap.of(
            "url",
            "https://httpbin.org/anything",
            "httpResponseBody",
            true,
            "httpRequestMethod",
            "POST",
            "httpRequestBody",
            base64Foo);
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    request.setEntity(new StringEntity(requestBody));

    try (CloseableHttpClient client = HttpClients.createDefault()) {
      try (CloseableHttpResponse response = client.execute(request)) {
        HttpEntity entity = response.getEntity();
        String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
        JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
        String base64HttpResponseBody = jsonObject.get("httpResponseBody").getAsString();
        byte[] httpResponseBodyBytes = Base64.getDecoder().decode(base64HttpResponseBody);
        String httpResponseBody = new String(httpResponseBodyBytes, StandardCharsets.UTF_8);
        JsonObject data = JsonParser.parseString(httpResponseBody).getAsJsonObject();
        String body = data.get("data").getAsString();
      }
    }
  }

  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    String encodedAuth = Base64.getEncoder().encodeToString(auth.getBytes());
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')

axios.post(
    'https://api.zyte.com/v1/extract',
    {
      url: 'https://httpbin.org/anything',
      httpResponseBody: true,
      httpRequestMethod: 'POST',
      httpRequestBody: Buffer.from('foo').toString('base64')
    },
    {
      auth: { username: 'YOUR_API_KEY' },
      headers: { 'Accept-Encoding': 'gzip, deflate' }
    }
  ).then((response) => {
    const httpResponseBody = Buffer.from(
      response.data.httpResponseBody,
      'base64'
    )
    const requestBody = JSON.parse(httpResponseBody).data
  })
<?php

$client = new GuzzleHttp\Client();
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', [
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => [
        'url' => 'https://httpbin.org/anything',
        'httpResponseBody' => true,
        'httpRequestMethod' => 'POST',
        'httpRequestBody' => base64_encode('foo'),
    ],
]);
$data = json_decode($response->getBody());
$http_response_body = base64_decode($data->httpResponseBody);
$request_body = json_decode($http_response_body)->data;
import json
from base64 import b64decode, b64encode

import requests

api_response = requests.post(
    'https://api.zyte.com/v1/extract',
    auth=('YOUR_API_KEY', ''),
    json={
        'url': 'https://httpbin.org/anything',
        'httpResponseBody': True,
        'httpRequestMethod': 'POST',
        'httpRequestBody': b64encode(b'foo').decode(),
    },
)
http_response_body = b64decode(
    api_response.json()['httpResponseBody']
)
request_body: str = json.loads(http_response_body)['data']
import asyncio
import json
from base64 import b64decode, b64encode

from zyte_api.aio.client import AsyncClient

async def main():
    client = AsyncClient()
    api_response = await client.request_raw(
        {
            'url': 'https://httpbin.org/anything',
            'httpResponseBody': True,
            'httpRequestMethod': 'POST',
            'httpRequestBody': b64encode(b'foo').decode(),
        }
    )
    http_response_body: bytes = b64decode(
        api_response['httpResponseBody']
    )
    request_body: str = json.loads(http_response_body)['data']

asyncio.run(main())
import json

from scrapy import Request, Spider


class HTTPBinOrgSpider(Spider):
    name = "httpbin_org"

    def start_requests(self):
        yield Request(
            "https://httpbin.org/anything",
            method="POST",
            body=b"foo",
        )

    def parse(self, response):
        request_body = json.loads(response.body)["data"]

Set request headers#

When extracting a response body, you can set the customHttpRequestHeaders key in your API request body to an array of objects with name and value keys representing headers to include in your request.

using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;

HttpClientHandler handler = new HttpClientHandler()
{
    AutomaticDecompression = DecompressionMethods.All
};
HttpClient client = new HttpClient(handler);

var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);

client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

var input = new Dictionary<string, object>(){
    {"url", "https://httpbin.org/anything"},
    {"httpResponseBody", true},
    {
        "customHttpRequestHeaders",
        new List<Dictionary<string, object>>()
        {
            new Dictionary<string, object>()
            {
                {"name", "Accept-Language"},
                {"value", "fa"}
            }
        }
    }
};
var inputJson = JsonSerializer.Serialize(input);
var content = new StringContent(inputJson, Encoding.UTF8, "application/json");

HttpResponseMessage response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
var body = await response.Content.ReadAsByteArrayAsync();

var data = JsonDocument.Parse(body);
var base64HttpResponseBody = data.RootElement.GetProperty("httpResponseBody").ToString();
var httpResponseBody = System.Convert.FromBase64String(base64HttpResponseBody);

var responseData = JsonDocument.Parse(httpResponseBody);
var headerEnumerator = responseData.RootElement.GetProperty("headers").EnumerateObject();
var headers = new Dictionary<string, string>();
while (headerEnumerator.MoveNext())
{
    headers.Add(
        headerEnumerator.Current.Name.ToString(),
        headerEnumerator.Current.Value.ToString()
    );
}
input.json#
{
    "url": "https://httpbin.org/anything",
    "httpResponseBody": true,
    "customHttpRequestHeaders": [
        {
            "name": "Accept-Language",
            "value": "fa"
        }
    ]
}
curl \
    --user YOUR_API_KEY: \
    --header 'Content-Type: application/json' \
    --data @input.json \
    --compressed \
    https://api.zyte.com/v1/extract \
| jq --raw-output .httpResponseBody \
| base64 --decode \
| jq .headers
input.jsonl#
{"url": "https://httpbin.org/anything", "httpResponseBody": true, "customHttpRequestHeaders": [{"name": "Accept-Language", "value": "fa"}]}
zyte-api input.jsonl 2> /dev/null \
| jq --raw-output .httpResponseBody \
| base64 --decode \
| jq .headers
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Collections;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;

class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> customHttpRequestHeader =
        ImmutableMap.of("name", "Accept-Language", "value", "fa");
    Map<String, Object> parameters =
        ImmutableMap.of(
            "url",
            "https://httpbin.org/anything",
            "httpResponseBody",
            true,
            "customHttpRequestHeaders",
            Collections.singletonList(customHttpRequestHeader));
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    request.setEntity(new StringEntity(requestBody));

    try (CloseableHttpClient client = HttpClients.createDefault()) {
      try (CloseableHttpResponse response = client.execute(request)) {
        HttpEntity entity = response.getEntity();
        String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
        JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
        String base64HttpResponseBody = jsonObject.get("httpResponseBody").getAsString();
        byte[] httpResponseBodyBytes = Base64.getDecoder().decode(base64HttpResponseBody);
        String httpResponseBody = new String(httpResponseBodyBytes, StandardCharsets.UTF_8);
        JsonObject data = JsonParser.parseString(httpResponseBody).getAsJsonObject();
        JsonObject headers = data.get("headers").getAsJsonObject();
      }
    }
  }

  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    String encodedAuth = Base64.getEncoder().encodeToString(auth.getBytes());
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')

axios.post(
    'https://api.zyte.com/v1/extract',
    {
      url: 'https://httpbin.org/anything',
      httpResponseBody: true,
      customHttpRequestHeaders: [
        {
          name: 'Accept-Language',
          value: 'fa'
        }
      ]
    },
    {
      auth: { username: 'YOUR_API_KEY' },
      headers: { 'Accept-Encoding': 'gzip, deflate' }
    }
  ).then((response) => {
    const httpResponseBody = Buffer.from(
      response.data.httpResponseBody,
      'base64'
    )
    const headers = JSON.parse(httpResponseBody).headers
  })
<?php

$client = new GuzzleHttp\Client();
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', [
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => [
        'url' => 'https://httpbin.org/anything',
        'httpResponseBody' => true,
        'customHttpRequestHeaders' => [
            [
                'name' => 'Accept-Language',
                'value' => 'fa',
            ],
        ],
    ],
]);
$api = json_decode($response->getBody());
$http_response_body = base64_decode($api->httpResponseBody);
$data = json_decode($http_response_body);
$headers = $data->headers;
import json
from base64 import b64decode

import requests

api_response = requests.post(
    'https://api.zyte.com/v1/extract',
    auth=('YOUR_API_KEY', ''),
    json={
        'url': 'https://httpbin.org/anything',
        'httpResponseBody': True,
        'customHttpRequestHeaders': [
            {
                'name': 'Accept-Language',
                'value': 'fa',
            },
        ],
    },
)
http_response_body = b64decode(
    api_response.json()['httpResponseBody']
)
headers = json.loads(http_response_body)['headers']
import asyncio
import json
from base64 import b64decode

from zyte_api.aio.client import AsyncClient

async def main():
    client = AsyncClient()
    api_response = await client.request_raw(
        {
            'url': 'https://httpbin.org/anything',
            'httpResponseBody': True,
            'customHttpRequestHeaders': [
                {
                    'name': 'Accept-Language',
                    'value': 'fa',
                },
            ],
        }
    )
    http_response_body: bytes = b64decode(
        api_response['httpResponseBody']
    )
    headers = json.loads(http_response_body)['headers']

asyncio.run(main())
import json

from scrapy import Request, Spider


class HTTPBinOrgSpider(Spider):
    name = "httpbin_org"

    def start_requests(self):
        yield Request(
            "https://httpbin.org/anything",
            headers={"Accept-Language": "fa"},
        )

    def parse(self, response):
        headers = json.loads(response.text)["headers"]

Zyte API sends some headers automatically. In case of conflict, your custom headers will usually override Zyte API headers. However, Zyte API may silently override or drop some of your custom headers to reduce the chance of your request being banned. For example, you can never set custom Cookie or User-Agent headers.

If you set multiple headers with the same name, only the last header value will be sent. To overcome this limitation, join the header values with a comma into a single header value. For example, replace "customHttpRequestHeaders": [{"name": "foo", "value": "bar"}, {"name": "foo", "value": "baz"}] with "customHttpRequestHeaders": [{"name": "foo", "value": "bar,baz"}].

Decode HTML#

While browser HTML is provided pre-decoded, as a string, HTML extracted as a response body needs to be decoded.

HTML content can be encoded with one of many character encodings, and you must determine the character encoding used so that you can decode that HTML content accordingly.

The best way to determine the encoding of HTML content is to follow the encoding sniffing algorithm defined in the HTML standard.

In addition to the HTML content, the HTML encoding sniffing algorithm takes into account any character encoding provided in the optional charset parameter of media types declared in the Content-Type response header, so make sure you get the response headers in addition to the response body if you are following the HTML encoding sniffing algorithm.

Use file to find the media type of a previously-downloaded response based solely on its body (i.e. not following the HTML encoding sniffing algorithm).

file --mime-encoding output.html

Use content-type-parser, html-encoding-sniffer and whatwg-encoding:

const contentTypeParser = require('content-type-parser')
const htmlEncodingSniffer = require('html-encoding-sniffer')
const whatwgEncoding = require('whatwg-encoding')

// …

const httpResponseHeaders = response.data.httpResponseHeaders
let contentTypeCharset
httpResponseHeaders.forEach(function (item) {
  if (item.name.toLowerCase() === 'content-type') {
    contentTypeCharset = contentTypeParser(item.value).get('charset')
  }
})
const httpResponseBody = Buffer.from(response.data.httpResponseBody, 'base64')
const encoding = htmlEncodingSniffer(httpResponseBody, {
  transportLayerEncodingLabel: contentTypeCharset
})
const html = whatwgEncoding.decode(httpResponseBody, encoding)

web-poet provides a response wrapper that automatically decodes the response body following an encoding sniffing algorithm similar to the one defined in the HTML standard.

Provided that you have extracted a response with both body and headers, and you have Base64-decoded the response body, you can decode the HTML bytes as follows:

from web_poet import HttpResponse

# …

headers = tuple(
    (item['name'], item['value'])
    for item in http_response_headers
)
response = HttpResponse(
    url='https://example.com',
    body=http_response_body,
    status=200,
    headers=headers,
)
html = response.text

In transparent mode, regular Scrapy requests targeting HTML resources decode them by default. See Extract a response body.

Extract response headers#

Set the httpResponseHeaders key in your API request body to true to extract response headers.

When you do, the Zyte API response includes an httpResponseHeaders key with the headers as an array of objects with name and value keys.

using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;

HttpClientHandler handler = new HttpClientHandler()
{
    AutomaticDecompression = DecompressionMethods.All
};
HttpClient client = new HttpClient(handler);

var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);

client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

var input = new Dictionary<string, object>(){
    {"url", "https://toscrape.com"},
    {"httpResponseHeaders", true}
};
var inputJson = JsonSerializer.Serialize(input);
var content = new StringContent(inputJson, Encoding.UTF8, "application/json");

HttpResponseMessage response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
var body = await response.Content.ReadAsByteArrayAsync();

var data = JsonDocument.Parse(body);
var headerEnumerator = data.RootElement.GetProperty("httpResponseHeaders").EnumerateArray();
var headers = new Dictionary<string, string>();
while (headerEnumerator.MoveNext())
{
    headers.Add(
        headerEnumerator.Current.GetProperty("name").ToString(),
        headerEnumerator.Current.GetProperty("value").ToString()
    );
}
input.json#
{"url": "https://toscrape.com", "httpResponseHeaders": true}
curl \
    --user YOUR_API_KEY: \
    --header 'Content-Type: application/json' \
    --data @input.json \
    --compressed \
    https://api.zyte.com/v1/extract \
| jq .httpResponseHeaders
input.jsonl#
{"url": "https://toscrape.com", "httpResponseHeaders": true}
zyte-api input.jsonl 2> /dev/null \
| jq .httpResponseHeaders
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;

class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> parameters =
        ImmutableMap.of(
            "url", "https://toscrape.com", "browserHtml", true, "httpResponseHeaders", true);
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    request.setEntity(new StringEntity(requestBody));

    try (CloseableHttpClient client = HttpClients.createDefault()) {
      try (CloseableHttpResponse response = client.execute(request)) {
        HttpEntity entity = response.getEntity();
        String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
        JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
        JsonArray httpResponseHeaders = jsonObject.get("httpResponseHeaders").getAsJsonArray();
      }
    }
  }

  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    String encodedAuth = Base64.getEncoder().encodeToString(auth.getBytes());
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')

axios.post(
    'https://api.zyte.com/v1/extract',
    {
      url: 'https://toscrape.com',
      httpResponseHeaders: true
    },
    {
      auth: { username: 'YOUR_API_KEY' },
      headers: { 'Accept-Encoding': 'gzip, deflate' }
    }
  ).then((response) => {
    const httpResponseHeaders = response.data.httpResponseHeaders
  })
<?php

$client = new GuzzleHttp\Client();
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', [
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => [
        'url' => 'https://toscrape.com',
        'httpResponseHeaders' => true,
    ],
]);
$api = json_decode($response->getBody());
$http_response_headers = $api->httpResponseHeaders;
import requests

api_response = requests.post(
    'https://api.zyte.com/v1/extract',
    auth=('YOUR_API_KEY', ''),
    json={
        'url': 'https://toscrape.com',
        'httpResponseHeaders': True,
    },
)
http_response_headers = api_response.json()['httpResponseHeaders']
import asyncio

from zyte_api.aio.client import AsyncClient

async def main():
    client = AsyncClient()
    api_response = await client.request_raw(
        {
            'url': 'https://toscrape.com',
            'httpResponseHeaders': True,
        }
    )
    http_response_headers = api_response['httpResponseHeaders']

asyncio.run(main())
from scrapy import Request, Spider


class ToScrapeComSpider(Spider):
    name = "toscrape_com"

    def start_requests(self):
        yield Request(
            "https://toscrape.com",
            meta={
                "zyte_api_automap": {
                    "httpResponseBody": False,
                    "httpResponseHeaders": True,
                },
            },
        )

    def parse(self, response):
        headers = response.headers

Note

In transparent mode, httpResponseHeaders is sent by default for httpResponseBody requests, but sending it explicitly is still recommended, as future versions of scrapy-zyte-api may stop sending it by default.

Zyte API may exclude some headers from the result, such as Set-Cookie.

Set a country of origin#

When extracting browser HTML, a screenshot, or a response body,set the geolocation key in your API request body to a supported ISO 3166-1 alpha-2 country code to channel your request through an IP address associated with the corresponding country.

using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;

HttpClientHandler handler = new HttpClientHandler()
{
    AutomaticDecompression = DecompressionMethods.All
};
HttpClient client = new HttpClient(handler);

var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);

client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

var input = new Dictionary<string, object>(){
    {"url", "http://ip-api.com/json"},
    {"httpResponseBody", true},
    {"geolocation", "AU"}
};
var inputJson = JsonSerializer.Serialize(input);
var content = new StringContent(inputJson, Encoding.UTF8, "application/json");

HttpResponseMessage response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
var body = await response.Content.ReadAsByteArrayAsync();

var data = JsonDocument.Parse(body);
var base64HttpResponseBody = data.RootElement.GetProperty("httpResponseBody").ToString();
var httpResponseBody = System.Convert.FromBase64String(base64HttpResponseBody);

var responseData = JsonDocument.Parse(httpResponseBody);
var countryCode = responseData.RootElement.GetProperty("countryCode").ToString();
input.json#
{"url": "http://ip-api.com/json", "httpResponseBody": true, "geolocation": "AU"}
curl \
    --user YOUR_API_KEY: \
    --header 'Content-Type: application/json' \
    --data @input.json \
    --compressed \
    https://api.zyte.com/v1/extract \
| jq --raw-output .httpResponseBody \
| base64 --decode \
| jq .countryCode
input.jsonl#
{"url": "http://ip-api.com/json", "httpResponseBody": true, "geolocation": "AU"}
zyte-api input.jsonl 2> /dev/null \
| jq --raw-output .httpResponseBody \
| base64 --decode \
| jq .countryCode
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;

class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> parameters =
        ImmutableMap.of(
            "url", "http://ip-api.com/json", "httpResponseBody", true, "geolocation", "AU");
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    request.setEntity(new StringEntity(requestBody));

    try (CloseableHttpClient client = HttpClients.createDefault()) {
      try (CloseableHttpResponse response = client.execute(request)) {
        HttpEntity entity = response.getEntity();
        String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
        JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
        String base64HttpResponseBody = jsonObject.get("httpResponseBody").getAsString();
        byte[] httpResponseBodyBytes = Base64.getDecoder().decode(base64HttpResponseBody);
        String httpResponseBody = new String(httpResponseBodyBytes, StandardCharsets.UTF_8);
        JsonObject data = JsonParser.parseString(httpResponseBody).getAsJsonObject();
        String countryCode = data.get("countryCode").getAsString();
      }
    }
  }

  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    String encodedAuth = Base64.getEncoder().encodeToString(auth.getBytes());
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')

axios.post(
    'https://api.zyte.com/v1/extract',
    {
      url: 'http://ip-api.com/json',
      httpResponseBody: true,
      geolocation: 'AU'
    },
    {
      auth: { username: 'YOUR_API_KEY' },
      headers: { 'Accept-Encoding': 'gzip, deflate' }
    }
  ).then((response) => {
    const httpResponseBody = Buffer.from(
      response.data.httpResponseBody,
      'base64'
    )
    const data = JSON.parse(httpResponseBody)
    const countryCode = data.countryCode
  })
<?php

$client = new GuzzleHttp\Client();
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', [
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => [
        'url' => 'http://ip-api.com/json',
        'httpResponseBody' => true,
        'geolocation' => 'AU',
    ],
]);
$api = json_decode($response->getBody());
$http_response_body = base64_decode($api->httpResponseBody);
$data = json_decode($http_response_body);
$country_code = $data->countryCode;
import json
from base64 import b64decode

import requests

api_response = requests.post(
    'https://api.zyte.com/v1/extract',
    auth=('YOUR_API_KEY', ''),
    json={
        'url': 'http://ip-api.com/json',
        'httpResponseBody': True,
        'geolocation': 'AU',
    },
)
http_response_body: bytes = b64decode(
    api_response.json()['httpResponseBody']
)
response_data = json.loads(http_response_body)
country_code = response_data['countryCode']
import asyncio
import json
from base64 import b64decode

from zyte_api.aio.client import AsyncClient

async def main():
    client = AsyncClient()
    api_response = await client.request_raw(
        {
            'url': 'http://ip-api.com/json',
            'httpResponseBody': True,
            'geolocation': 'AU',
        }
    )
    http_response_body: bytes = b64decode(
        api_response['httpResponseBody']
    )
    response_data = json.loads(http_response_body)
    country_code = response_data['countryCode']

asyncio.run(main())
import json

from scrapy import Request, Spider


class IPAPIComSpider(Spider):
    name = "ip_api_com"

    def start_requests(self):
        yield Request(
            "http://ip-api.com/json",
            meta={
                "zyte_api_automap": {
                    "geolocation": "AU",
                },
            },
        )

    def parse(self, response):
        response_data = json.loads(response.body)
        country_code = response_data["countryCode"]

Look up the geolocation key in the specification for the list of supported countries.

When the geolocation key is not specified, Zyte API aims to channel your request through a country that ensures a good response from the target website, meaning that the chosen country:

  • Does not cause unexpected locale changes in the response data, such as the wrong language, currency, date format, time zone, etc.

  • Does not cause your request to be banned.

Zyte API can use countries of origin beyond those supported by the geolocation key. For example, if you access a Turkish website, Zyte API may access the website from Türkiye as long as you do not specify otherwise through the geolocation key, even though geolocation does not support TR as a value at the moment.

Set request metadata#

Set the echoData key in your API request body to an arbitrary value, to get that value verbatim in the API response.

When sending multiple requests in parallel, this can be useful, for example, to keep track of the original request order.

using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;

var inputData = new List<List<object>>()
{
    new List<object>(){"https://toscrape.com", 1},
    new List<object>(){"https://books.toscrape.com", 2},
    new List<object>(){"https://quotes.toscrape.com", 3},
};
var output = new List<HttpResponseMessage>();

var handler = new HttpClientHandler()
{
    AutomaticDecompression = DecompressionMethods.All,
    MaxConnectionsPerServer = 15
};
var client = new HttpClient(handler);

var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);

client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

var responseTasks = new List<Task<HttpResponseMessage>>();
foreach (var entry in inputData)
{
    var input = new Dictionary<string, object>(){
        {"url", entry[0]},
        {"browserHtml", true},
        {"echoData", entry[1]}
    };
    var inputJson = JsonSerializer.Serialize(input);
    var content = new StringContent(inputJson, Encoding.UTF8, "application/json");
    var responseTask = client.PostAsync("https://api.zyte.com/v1/extract", content);
    responseTasks.Add(responseTask);
}

while (responseTasks.Any())
{
    var responseTask = await Task.WhenAny(responseTasks);
    responseTasks.Remove(responseTask);
    var response = await responseTask;
    output.Add(response);
}
input.jsonl#
{"url": "https://toscrape.com", "browserHtml": true, "echoData": 1}
{"url": "https://books.toscrape.com", "browserHtml": true, "echoData": 2}
{"url": "https://quotes.toscrape.com", "browserHtml": true, "echoData": 3}
cat input.jsonl \
| xargs -P 15 -d\\n -n 1 \
bash -c "
    curl \
        --user $ZYTE_API_KEY: \
        --header 'Content-Type: application/json' \
        --data \"\$0\" \
        --compressed \
        https://api.zyte.com/v1/extract \
    | jq .echoData \
    | awk '{print \$1}' \
    >> output.jsonl
"
input.jsonl#
{"url": "https://toscrape.com", "browserHtml": true, "echoData": 1}
{"url": "https://books.toscrape.com", "browserHtml": true, "echoData": 2}
{"url": "https://quotes.toscrape.com", "browserHtml": true, "echoData": 3}
zyte-api --n-conn 15 input.jsonl -o output.jsonl
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Base64;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import org.apache.hc.client5.http.async.methods.SimpleHttpRequest;
import org.apache.hc.client5.http.async.methods.SimpleHttpResponse;
import org.apache.hc.client5.http.impl.async.CloseableHttpAsyncClient;
import org.apache.hc.client5.http.impl.async.HttpAsyncClients;
import org.apache.hc.client5.http.impl.nio.PoolingAsyncClientConnectionManager;
import org.apache.hc.client5.http.impl.nio.PoolingAsyncClientConnectionManagerBuilder;
import org.apache.hc.client5.http.ssl.ClientTlsStrategyBuilder;
import org.apache.hc.core5.concurrent.FutureCallback;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.nio.ssl.TlsStrategy;
import org.apache.hc.core5.reactor.ssl.TlsDetails;

class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  public static void main(final String[] args)
      throws ExecutionException, InterruptedException, IOException, ParseException {

    Object[][] input = {
      {"https://toscrape.com", 1},
      {"https://bookstoscrape.com", 2},
      {"https://quotes.toscrape.com", 3}
    };
    List<Future> futures = new ArrayList<Future>();
    List<String> output = new ArrayList<String>();

    int concurrency = 15;

    // https://issues.apache.org/jira/browse/HTTPCLIENT-2219
    final TlsStrategy tlsStrategy =
        ClientTlsStrategyBuilder.create()
            .useSystemProperties()
            .setTlsDetailsFactory(
                sslEngine ->
                    new TlsDetails(sslEngine.getSession(), sslEngine.getApplicationProtocol()))
            .build();

    PoolingAsyncClientConnectionManager connectionManager =
        PoolingAsyncClientConnectionManagerBuilder.create().setTlsStrategy(tlsStrategy).build();
    connectionManager.setMaxTotal(concurrency);
    connectionManager.setDefaultMaxPerRoute(concurrency);

    CloseableHttpAsyncClient client =
        HttpAsyncClients.custom().setConnectionManager(connectionManager).build();
    try {
      client.start();
      for (int i = 0; i < input.length; i++) {
        Map<String, Object> parameters =
            ImmutableMap.of("url", input[i][0], "browserHtml", true, "echoData", input[i][1]);
        String requestBody = new Gson().toJson(parameters);

        SimpleHttpRequest request =
            new SimpleHttpRequest("POST", "https://api.zyte.com/v1/extract");
        request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
        request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
        request.setBody(requestBody, ContentType.APPLICATION_JSON);

        final Future<SimpleHttpResponse> future =
            client.execute(
                request,
                new FutureCallback<SimpleHttpResponse>() {
                  public void completed(final SimpleHttpResponse response) {
                    String apiResponse = response.getBodyText();
                    output.add(apiResponse);
                  }

                  public void failed(final Exception ex) {}

                  public void cancelled() {}
                });
        futures.add(future);
      }
      for (int i = 0; i < futures.size(); i++) {
        futures.get(i).get();
      }
    } finally {
      client.close();
    }
  }

  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    String encodedAuth = Base64.getEncoder().encodeToString(auth.getBytes());
    return "Basic " + encodedAuth;
  }
}
const { ConcurrencyManager } = require('axios-concurrency')
const axios = require('axios')

const urls = [
  ['https://toscrape.com', 1],
  ['https://books.toscrape.com', 2],
  ['https://quotes.toscrape.com', 3]
]
const output = []

const client = axios.create()
ConcurrencyManager(client, 15)

Promise.all(
  urls.map((input) =>
    client.post(
        'https://api.zyte.com/v1/extract',
        { url: input[0], browserHtml: true, echoData: input[1] },
        { 
          auth: { username: 'YOUR_API_KEY' },
          headers: { 'Accept-Encoding': 'gzip, deflate' }
        }
      ).then((response) => output.push(response.data))
  )
)
<?php

$input = [
    ['https://toscrape.com', 1],
    ['https://books.toscrape.com', 2],
    ['https://quotes.toscrape.com', 3],
];
$output = [];
$promises = [];

$client = new GuzzleHttp\Client();

foreach ($input as $url_and_index) {
    $options = [
        'auth' => ['YOUR_API_KEY', ''],
        'headers' => ['Accept-Encoding' => 'gzip'],
        'json' => [
            'url' => $url_and_index[0],
            'browserHtml' => true,
            'echoData' => $url_and_index[1],
        ],
    ];
    $request = new \GuzzleHttp\Psr7\Request('POST', 'https://api.zyte.com/v1/extract');
    global $promises;
    $promises[] = $client->sendAsync($request, $options)->then(function ($response) {
        global $output;
        $output[] = json_decode($response->getBody());
    });
}

foreach ($promises as $promise) {
    $promise->wait();
}
import asyncio

import aiohttp

input_data = [
    ('https://toscrape.com', 1),
    ('https://books.toscrape.com', 2),
    ('https://quotes.toscrape.com', 3),
]
output = []


async def extract(client, url, index):
    response = await client.post(
        'https://api.zyte.com/v1/extract',
        json={'url': url, 'browserHtml': True, 'echoData': index},
        auth=aiohttp.BasicAuth('YOUR_API_KEY'),
    )
    output.append(await response.json())


async def main():
    connector = aiohttp.TCPConnector(limit_per_host=15)
    async with aiohttp.ClientSession(connector=connector) as client:
        await asyncio.gather(
            *[extract(client, url, index) for url, index in input_data]
        )

asyncio.run(main())
import asyncio

from zyte_api.aio.client import AsyncClient, create_session

input_data = [
    ('https://toscrape.com', 1),
    ('https://books.toscrape.com', 2),
    ('https://quotes.toscrape.com', 3),
]
output = []


async def main():
    connection_count = 15
    client = AsyncClient(n_conn=connection_count)
    requests = [
        {'url': url, 'browserHtml': True, 'echoData': index}
        for url, index in input_data
    ]
    async with create_session(connection_count) as session:
        responses = client.request_parallel_as_completed(
            requests,
            session=session,
        )
        for response in responses:
            output.append(await response)

asyncio.run(main())
from scrapy import Request, Spider

input_data = [
    ("https://toscrape.com", 1),
    ("https://books.toscrape.com", 2),
    ("https://quotes.toscrape.com", 3),
]


class ToScrapeSpider(Spider):
    name = "toscrape_com"

    custom_settings = {
        "CONCURRENT_REQUESTS": 15,
        "CONCURRENT_REQUESTS_PER_DOMAIN": 15,
    }

    def start_requests(self):
        for url, index in input_data:
            yield Request(
                url,
                meta={
                    "zyte_api_automap": {
                        "browserHtml": True,
                        "echoData": index,
                    },
                },
            )

    def parse(self, response):
        yield {
            "index": response.raw_api_response["echoData"],
            "html": response.text,
        }

Alternatively, you can use Scrapy’s Request.cb_kwargs directly for a similar purpose:


    def start_requests(self):
        for url, index in input_data:
            yield Request(
                url,
                cb_kwargs={"index": index},
                meta={
                    "zyte_api_automap": {
                        "browserHtml": True,
                    },
                },
            )

    def parse(self, response, index):
        yield {
            "index": index,
            "html": response.text,
        }

There is another metadata field that you can set and get verbatim on the API response: jobId. When running your requests from a Zyte Scrapy Cloud job, this field is meant to indicate the corresponding job ID. scrapy-zyte-api fills this field automatically.