Zyte API HTTP requests#

To send HTTP requests through Zyte API, without browser rendering, set the httpResponseBody request field to true, and read the Base64-encoded response body from the httpResponseBody response field.

Example

Note

Install and configure code example requirements and the Zyte CA certificate to run the example below.

using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;

HttpClientHandler handler = new HttpClientHandler()
{
    AutomaticDecompression = DecompressionMethods.All
};
HttpClient client = new HttpClient(handler);

var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);

client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

var input = new Dictionary<string, object>(){
    {"url", "https://toscrape.com"},
    {"httpResponseBody", true}
};
var inputJson = JsonSerializer.Serialize(input);
var content = new StringContent(inputJson, Encoding.UTF8, "application/json");

HttpResponseMessage response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
var body = await response.Content.ReadAsByteArrayAsync();

var data = JsonDocument.Parse(body);
var base64HttpResponseBody = data.RootElement.GetProperty("httpResponseBody").ToString();
var httpResponseBody = System.Convert.FromBase64String(base64HttpResponseBody);
input.jsonl#
{"url": "https://toscrape.com", "httpResponseBody": true}
zyte-api input.jsonl \
    | jq --raw-output .httpResponseBody \
    | base64 --decode \
    > output.html
input.json#
{
    "url": "https://toscrape.com",
    "httpResponseBody": true
}
curl \
    --user YOUR_API_KEY: \
    --header 'Content-Type: application/json' \
    --data @input.json \
    --compressed \
    https://api.zyte.com/v1/extract \
    | jq --raw-output .httpResponseBody \
    | base64 --decode \
    > output.html
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;

class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> parameters =
        ImmutableMap.of("url", "https://toscrape.com", "httpResponseBody", true);
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    request.setEntity(new StringEntity(requestBody));

    try (CloseableHttpClient client = HttpClients.createDefault()) {
      try (CloseableHttpResponse response = client.execute(request)) {
        HttpEntity entity = response.getEntity();
        String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
        JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
        String base64HttpResponseBody = jsonObject.get("httpResponseBody").getAsString();
        byte[] httpResponseBodyBytes = Base64.getDecoder().decode(base64HttpResponseBody);
        String httpResponseBody = new String(httpResponseBodyBytes, StandardCharsets.UTF_8);
      }
    }
  }

  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    String encodedAuth = Base64.getEncoder().encodeToString(auth.getBytes());
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')

axios.post(
  'https://api.zyte.com/v1/extract',
  {
    url: 'https://toscrape.com',
    httpResponseBody: true
  },
  {
    auth: { username: 'YOUR_API_KEY' }
  }
).then((response) => {
  const httpResponseBody = Buffer.from(
    response.data.httpResponseBody,
    'base64'
  )
})
<?php

$client = new GuzzleHttp\Client();
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', [
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => [
        'url' => 'https://toscrape.com',
        'httpResponseBody' => true,
    ],
]);
$data = json_decode($response->getBody());
$http_response_body = base64_decode($data->httpResponseBody);

With the proxy mode, you always get a response body.

curl \
    --proxy api.zyte.com:8011 \
    --proxy-user YOUR_API_KEY: \
    --compressed \
    https://toscrape.com \
> output.html
from base64 import b64decode

import requests

api_response = requests.post(
    "https://api.zyte.com/v1/extract",
    auth=("YOUR_API_KEY", ""),
    json={
        "url": "https://toscrape.com",
        "httpResponseBody": True,
    },
)
http_response_body: bytes = b64decode(api_response.json()["httpResponseBody"])
import asyncio
from base64 import b64decode

from zyte_api import AsyncZyteAPI


async def main():
    client = AsyncZyteAPI()
    api_response = await client.get(
        {
            "url": "https://toscrape.com",
            "httpResponseBody": True,
        }
    )
    http_response_body = b64decode(api_response["httpResponseBody"]).decode()
    print(http_response_body)


asyncio.run(main())

In transparent mode, when you target a text resource (e.g. HTML, JSON), regular Scrapy requests work out of the box:

from scrapy import Spider


class ToScrapeSpider(Spider):
    name = "toscrape_com"
    start_urls = ["https://toscrape.com"]

    def parse(self, response):
        http_response_text: str = response.text

While regular Scrapy requests also work for binary responses at the moment, they may stop working in future versions of scrapy-zyte-api, so passing httpResponseBody is recommended when targeting binary resources:

from scrapy import Request, Spider


class ToScrapeSpider(Spider):
    name = "toscrape_com"

    def start_requests(self):
        yield Request(
            "https://toscrape.com",
            meta={
                "zyte_api_automap": {
                    "httpResponseBody": True,
                },
            },
        )

    def parse(self, response):
        http_response_body: bytes = response.body

Output (first 5 lines):

<!DOCTYPE html>
<html lang="en">
    <head>
        <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
        <title>Scraping Sandbox</title>

For HTTP requests, Zyte API also supports:

Tip

HTTP responses do not reflect HTML content rendered by a web browser that executes JavaScript code. To get browser HTML, use a browser request. See also HTML and browser HTML.

Request method#

HTTP requests use the GET HTTP method by default. Use the httpRequestMethod field to set a different HTTP method.

Tip

When using POST, PUT or similar, you probably want to also set a request body.

Example

Note

Install and configure code example requirements and the Zyte CA certificate to run the example below.

using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;

HttpClientHandler handler = new HttpClientHandler()
{
    AutomaticDecompression = DecompressionMethods.All
};
HttpClient client = new HttpClient(handler);

var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);

client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

var input = new Dictionary<string, object>(){
    {"url", "https://httpbin.org/anything"},
    {"httpResponseBody", true},
    {"httpRequestMethod", "POST"}
};
var inputJson = JsonSerializer.Serialize(input);
var content = new StringContent(inputJson, Encoding.UTF8, "application/json");

HttpResponseMessage response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
var body = await response.Content.ReadAsByteArrayAsync();

var data = JsonDocument.Parse(body);
var base64HttpResponseBody = data.RootElement.GetProperty("httpResponseBody").ToString();
var httpResponseBody = System.Convert.FromBase64String(base64HttpResponseBody);

var responseData = JsonDocument.Parse(httpResponseBody);
var method = responseData.RootElement.GetProperty("method").ToString();
input.jsonl#
{"url": "https://httpbin.org/anything", "httpResponseBody": true, "httpRequestMethod": "POST"}
zyte-api input.jsonl \
    | jq --raw-output .httpResponseBody \
    | base64 --decode \
    | jq .method
input.json#
{
    "url": "https://httpbin.org/anything",
    "httpResponseBody": true,
    "httpRequestMethod": "POST"
}
curl \
    --user YOUR_API_KEY: \
    --header 'Content-Type: application/json' \
    --data @input.json \
    --compressed \
    https://api.zyte.com/v1/extract \
    | jq --raw-output .httpResponseBody \
    | base64 --decode \
    | jq .method
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;

class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> parameters =
        ImmutableMap.of(
            "url",
            "https://httpbin.org/anything",
            "httpResponseBody",
            true,
            "httpRequestMethod",
            "POST");
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    request.setEntity(new StringEntity(requestBody));

    try (CloseableHttpClient client = HttpClients.createDefault()) {
      try (CloseableHttpResponse response = client.execute(request)) {
        HttpEntity entity = response.getEntity();
        String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
        JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
        String base64HttpResponseBody = jsonObject.get("httpResponseBody").getAsString();
        byte[] httpResponseBodyBytes = Base64.getDecoder().decode(base64HttpResponseBody);
        String httpResponseBody = new String(httpResponseBodyBytes, StandardCharsets.UTF_8);
        JsonObject data = JsonParser.parseString(httpResponseBody).getAsJsonObject();
        String method = data.get("method").getAsString();
      }
    }
  }

  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    String encodedAuth = Base64.getEncoder().encodeToString(auth.getBytes());
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')

axios.post(
  'https://api.zyte.com/v1/extract',
  {
    url: 'https://httpbin.org/anything',
    httpResponseBody: true,
    httpRequestMethod: 'POST'
  },
  {
    auth: { username: 'YOUR_API_KEY' }
  }
).then((response) => {
  const httpResponseBody = Buffer.from(
    response.data.httpResponseBody,
    'base64'
  )
  const method = JSON.parse(httpResponseBody).method
})
<?php

$client = new GuzzleHttp\Client();
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', [
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => [
        'url' => 'https://httpbin.org/anything',
        'httpResponseBody' => true,
        'httpRequestMethod' => 'POST',
    ],
]);
$data = json_decode($response->getBody());
$http_response_body = base64_decode($data->httpResponseBody);
$method = json_decode($http_response_body)->method;

With the proxy mode, the request method from your requests is used automatically.

curl \
    --proxy api.zyte.com:8011 \
    --proxy-user YOUR_API_KEY: \
    --compressed \
    -X POST \
    https://httpbin.org/anything \
    | jq .method
import json
from base64 import b64decode

import requests

api_response = requests.post(
    "https://api.zyte.com/v1/extract",
    auth=("YOUR_API_KEY", ""),
    json={
        "url": "https://httpbin.org/anything",
        "httpResponseBody": True,
        "httpRequestMethod": "POST",
    },
)
http_response_body = b64decode(api_response.json()["httpResponseBody"])
method = json.loads(http_response_body)["method"]
import asyncio
import json
from base64 import b64decode

from zyte_api import AsyncZyteAPI


async def main():
    client = AsyncZyteAPI()
    api_response = await client.get(
        {
            "url": "https://httpbin.org/anything",
            "httpResponseBody": True,
            "httpRequestMethod": "POST",
        }
    )
    http_response_body: bytes = b64decode(api_response["httpResponseBody"])
    method = json.loads(http_response_body)["method"]
    print(method)


asyncio.run(main())
import json

from scrapy import Request, Spider


class HTTPBinOrgSpider(Spider):
    name = "httpbin_org"

    def start_requests(self):
        yield Request(
            "https://httpbin.org/anything",
            method="POST",
        )

    def parse(self, response):
        method = json.loads(response.text)["method"]

Output:

"POST"

Request body#

To include a body in your request, use one of the following fields:

httpRequestText example

Note

Install and configure code example requirements and the Zyte CA certificate to run the example below.

using System;
using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;

HttpClientHandler handler = new HttpClientHandler()
{
    AutomaticDecompression = DecompressionMethods.All
};
HttpClient client = new HttpClient(handler);

var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);

client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

var input = new Dictionary<string, object>(){
    {"url", "https://httpbin.org/anything"},
    {"httpResponseBody", true},
    {"httpRequestMethod", "POST"},
    {"httpRequestText", "{\"foo\": \"bar\"}"}
};
var inputJson = JsonSerializer.Serialize(input);
var content = new StringContent(inputJson, Encoding.UTF8, "application/json");

HttpResponseMessage response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
var body = await response.Content.ReadAsByteArrayAsync();

var data = JsonDocument.Parse(body);
var base64HttpResponseBody = data.RootElement.GetProperty("httpResponseBody").ToString();
var httpResponseBody = System.Convert.FromBase64String(base64HttpResponseBody);

var responseData = JsonDocument.Parse(httpResponseBody);
var requestBody = responseData.RootElement.GetProperty("data").ToString();

Console.WriteLine(requestBody);
input.jsonl#
{"url": "https://httpbin.org/anything", "httpResponseBody": true, "httpRequestMethod": "POST", "httpRequestText": "{\"foo\": \"bar\"}"}
zyte-api input.jsonl \
    | jq --raw-output .httpResponseBody \
    | base64 --decode \
    | jq --raw-output .data
input.json#
{
    "url": "https://httpbin.org/anything",
    "httpResponseBody": true,
    "httpRequestMethod": "POST",
    "httpRequestText": "{\"foo\": \"bar\"}"
}
curl \
    --user YOUR_API_KEY: \
    --header 'Content-Type: application/json' \
    --data @input.json \
    --compressed \
    https://api.zyte.com/v1/extract \
| jq --raw-output .httpResponseBody \
| base64 --decode \
| jq --raw-output .data
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;

class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> parameters =
        ImmutableMap.of(
            "url",
            "https://httpbin.org/anything",
            "httpResponseBody",
            true,
            "httpRequestMethod",
            "POST",
            "httpRequestText",
            "{\"foo\": \"bar\"}");
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    request.setEntity(new StringEntity(requestBody));

    try (CloseableHttpClient client = HttpClients.createDefault()) {
      try (CloseableHttpResponse response = client.execute(request)) {
        HttpEntity entity = response.getEntity();
        String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
        JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
        String base64HttpResponseBody = jsonObject.get("httpResponseBody").getAsString();
        byte[] httpResponseBodyBytes = Base64.getDecoder().decode(base64HttpResponseBody);
        String httpResponseBody = new String(httpResponseBodyBytes, StandardCharsets.UTF_8);
        JsonObject data = JsonParser.parseString(httpResponseBody).getAsJsonObject();
        String body = data.get("data").getAsString();
        System.out.println(body);
      }
    }
  }

  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    String encodedAuth = Base64.getEncoder().encodeToString(auth.getBytes());
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')

axios.post(
  'https://api.zyte.com/v1/extract',
  {
    url: 'https://httpbin.org/anything',
    httpResponseBody: true,
    httpRequestMethod: 'POST',
    httpRequestText: '{"foo": "bar"}'
  },
  {
    auth: { username: 'YOUR_API_KEY' }
  }
).then((response) => {
  const httpResponseBody = Buffer.from(
    response.data.httpResponseBody,
    'base64'
  )
  const body = JSON.parse(httpResponseBody).data
  console.log(body)
})
<?php

$client = new GuzzleHttp\Client();
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', [
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => [
        'url' => 'https://httpbin.org/anything',
        'httpResponseBody' => true,
        'httpRequestMethod' => 'POST',
        'httpRequestText' => '{"foo": "bar"}',
    ],
]);
$data = json_decode($response->getBody());
$http_response_body = base64_decode($data->httpResponseBody);
$body = json_decode($http_response_body)->data;
echo $body.PHP_EOL;

With the proxy mode, the request body from your requests is used automatically, be it plain text or binary.

curl \
    --proxy api.zyte.com:8011 \
    --proxy-user YOUR_API_KEY: \
    --compressed \
    -X POST \
    -H "Content-Type: application/json" \
    --data '{"foo": "bar"}' \
    https://httpbin.org/anything \
    | jq .data
import json
from base64 import b64decode

import requests

api_response = requests.post(
    "https://api.zyte.com/v1/extract",
    auth=("YOUR_API_KEY", ""),
    json={
        "url": "https://httpbin.org/anything",
        "httpResponseBody": True,
        "httpRequestMethod": "POST",
        "httpRequestText": '{"foo": "bar"}',
    },
)
http_response_body = b64decode(api_response.json()["httpResponseBody"])
body: str = json.loads(http_response_body)["data"]
print(body)
import asyncio
import json
from base64 import b64decode

from zyte_api import AsyncZyteAPI


async def main():
    client = AsyncZyteAPI()
    api_response = await client.get(
        {
            "url": "https://httpbin.org/anything",
            "httpResponseBody": True,
            "httpRequestMethod": "POST",
            "httpRequestText": '{"foo": "bar"}',
        }
    )
    http_response_body = b64decode(api_response["httpResponseBody"])
    body = json.loads(http_response_body)["data"]
    print(body)


asyncio.run(main())
import json

from scrapy import Request, Spider


class HTTPBinOrgSpider(Spider):
    name = "httpbin_org"

    def start_requests(self):
        yield Request(
            "https://httpbin.org/anything",
            method="POST",
            body='{"foo": "bar"}',
        )

    def parse(self, response):
        body = json.loads(response.body)["data"]
        print(body)

Output:

{"foo": "bar"}
httpRequestBody example

Note

Install and configure code example requirements and the Zyte CA certificate to run the example below.

using System;
using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;

HttpClientHandler handler = new HttpClientHandler()
{
    AutomaticDecompression = DecompressionMethods.All
};
HttpClient client = new HttpClient(handler);

var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);

client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

var input = new Dictionary<string, object>(){
    {"url", "https://httpbin.org/anything"},
    {"httpResponseBody", true},
    {"httpRequestMethod", "POST"},
    {"httpRequestBody", "Zm9v"}
};
var inputJson = JsonSerializer.Serialize(input);
var content = new StringContent(inputJson, Encoding.UTF8, "application/json");

HttpResponseMessage response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
var body = await response.Content.ReadAsByteArrayAsync();

var data = JsonDocument.Parse(body);
var base64HttpResponseBody = data.RootElement.GetProperty("httpResponseBody").ToString();
var httpResponseBody = System.Convert.FromBase64String(base64HttpResponseBody);

var responseData = JsonDocument.Parse(httpResponseBody);
var requestBody = responseData.RootElement.GetProperty("data").ToString();

Console.WriteLine(requestBody);
input.jsonl#
{"url": "https://httpbin.org/anything", "httpResponseBody": true, "httpRequestMethod": "POST", "httpRequestBody": "Zm9v"}
zyte-api input.jsonl \
    | jq --raw-output .httpResponseBody \
    | base64 --decode \
    | jq --raw-output .data
input.json#
{
    "url": "https://httpbin.org/anything",
    "httpResponseBody": true,
    "httpRequestMethod": "POST",
    "httpRequestBody": "Zm9v"
}
curl \
    --user YOUR_API_KEY: \
    --header 'Content-Type: application/json' \
    --data @input.json \
    --compressed \
    https://api.zyte.com/v1/extract \
| jq --raw-output .httpResponseBody \
| base64 --decode \
| jq --raw-output .data
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;

class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> parameters =
        ImmutableMap.of(
            "url",
            "https://httpbin.org/anything",
            "httpResponseBody",
            true,
            "httpRequestMethod",
            "POST",
            "httpRequestBody",
            "Zm9v");
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    request.setEntity(new StringEntity(requestBody));

    try (CloseableHttpClient client = HttpClients.createDefault()) {
      try (CloseableHttpResponse response = client.execute(request)) {
        HttpEntity entity = response.getEntity();
        String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
        JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
        String base64HttpResponseBody = jsonObject.get("httpResponseBody").getAsString();
        byte[] httpResponseBodyBytes = Base64.getDecoder().decode(base64HttpResponseBody);
        String httpResponseBody = new String(httpResponseBodyBytes, StandardCharsets.UTF_8);
        JsonObject data = JsonParser.parseString(httpResponseBody).getAsJsonObject();
        String body = data.get("data").getAsString();
        System.out.println(body);
      }
    }
  }

  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    String encodedAuth = Base64.getEncoder().encodeToString(auth.getBytes());
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')

axios.post(
  'https://api.zyte.com/v1/extract',
  {
    url: 'https://httpbin.org/anything',
    httpResponseBody: true,
    httpRequestMethod: 'POST',
    httpRequestBody: 'Zm9v'
  },
  {
    auth: { username: 'YOUR_API_KEY' }
  }
).then((response) => {
  const httpResponseBody = Buffer.from(
    response.data.httpResponseBody,
    'base64'
  )
  const body = JSON.parse(httpResponseBody).data
  console.log(body)
})
<?php

$client = new GuzzleHttp\Client();
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', [
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => [
        'url' => 'https://httpbin.org/anything',
        'httpResponseBody' => true,
        'httpRequestMethod' => 'POST',
        'httpRequestBody' => 'Zm9v',
    ],
]);
$data = json_decode($response->getBody());
$http_response_body = base64_decode($data->httpResponseBody);
$body = json_decode($http_response_body)->data;
echo $body.PHP_EOL;

With the proxy mode, the request body from your requests is used automatically, be it plain text or binary.

curl \
    --proxy api.zyte.com:8011 \
    --proxy-user YOUR_API_KEY: \
    --compressed \
    -X POST \
    -H "Content-Type: application/octet-stream" \
    --data foo \
    https://httpbin.org/anything \
    | jq .data
import json
from base64 import b64decode

import requests

api_response = requests.post(
    "https://api.zyte.com/v1/extract",
    auth=("YOUR_API_KEY", ""),
    json={
        "url": "https://httpbin.org/anything",
        "httpResponseBody": True,
        "httpRequestMethod": "POST",
        "httpRequestBody": "Zm9v",
    },
)
http_response_body = b64decode(api_response.json()["httpResponseBody"])
body: str = json.loads(http_response_body)["data"]
print(body)
import asyncio
import json
from base64 import b64decode

from zyte_api import AsyncZyteAPI


async def main():
    client = AsyncZyteAPI()
    api_response = await client.get(
        {
            "url": "https://httpbin.org/anything",
            "httpResponseBody": True,
            "httpRequestMethod": "POST",
            "httpRequestBody": "Zm9v",
        }
    )
    http_response_body: bytes = b64decode(api_response["httpResponseBody"])
    body = json.loads(http_response_body)["data"]
    print(body)


asyncio.run(main())
import json

from scrapy import Request, Spider


class HTTPBinOrgSpider(Spider):
    name = "httpbin_org"

    def start_requests(self):
        yield Request(
            "https://httpbin.org/anything",
            method="POST",
            body=b"foo",
        )

    def parse(self, response):
        body = json.loads(response.body)["data"]
        print(body)

Output:

foo

Request headers#

In HTTP requests, use customHttpRequestHeaders to set request headers. You can set any header except Cookie (see Cookies).

Tip

You can also set headers like Accept, Accept-Encoding, Accept-Language or User-Agent, but it is usually best to let Zyte API set those headers; it will use values consistent with the network stack and other request parameters (e.g. device, geolocation).

Example

Note

Install and configure code example requirements and the Zyte CA certificate to run the example below.

using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;

HttpClientHandler handler = new HttpClientHandler()
{
    AutomaticDecompression = DecompressionMethods.All
};
HttpClient client = new HttpClient(handler);

var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);

client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

var input = new Dictionary<string, object>(){
    {"url", "https://httpbin.org/anything"},
    {"httpResponseBody", true},
    {
        "customHttpRequestHeaders",
        new List<Dictionary<string, object>>()
        {
            new Dictionary<string, object>()
            {
                {"name", "Accept-Language"},
                {"value", "fa"}
            }
        }
    }
};
var inputJson = JsonSerializer.Serialize(input);
var content = new StringContent(inputJson, Encoding.UTF8, "application/json");

HttpResponseMessage response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
var body = await response.Content.ReadAsByteArrayAsync();

var data = JsonDocument.Parse(body);
var base64HttpResponseBody = data.RootElement.GetProperty("httpResponseBody").ToString();
var httpResponseBody = System.Convert.FromBase64String(base64HttpResponseBody);

var responseData = JsonDocument.Parse(httpResponseBody);
var headerEnumerator = responseData.RootElement.GetProperty("headers").EnumerateObject();
var headers = new Dictionary<string, string>();
while (headerEnumerator.MoveNext())
{
    headers.Add(
        headerEnumerator.Current.Name.ToString(),
        headerEnumerator.Current.Value.ToString()
    );
}
input.jsonl#
{"url": "https://httpbin.org/anything", "httpResponseBody": true, "customHttpRequestHeaders": [{"name": "Accept-Language", "value": "fa"}]}
zyte-api input.jsonl \
    | jq --raw-output .httpResponseBody \
    | base64 --decode \
    | jq .headers
input.json#
{
    "url": "https://httpbin.org/anything",
    "httpResponseBody": true,
    "customHttpRequestHeaders": [
        {
            "name": "Accept-Language",
            "value": "fa"
        }
    ]
}
curl \
    --user YOUR_API_KEY: \
    --header 'Content-Type: application/json' \
    --data @input.json \
    --compressed \
    https://api.zyte.com/v1/extract \
    | jq --raw-output .httpResponseBody \
    | base64 --decode \
    | jq .headers
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Collections;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;

class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> customHttpRequestHeader =
        ImmutableMap.of("name", "Accept-Language", "value", "fa");
    Map<String, Object> parameters =
        ImmutableMap.of(
            "url",
            "https://httpbin.org/anything",
            "httpResponseBody",
            true,
            "customHttpRequestHeaders",
            Collections.singletonList(customHttpRequestHeader));
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    request.setEntity(new StringEntity(requestBody));

    try (CloseableHttpClient client = HttpClients.createDefault()) {
      try (CloseableHttpResponse response = client.execute(request)) {
        HttpEntity entity = response.getEntity();
        String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
        JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
        String base64HttpResponseBody = jsonObject.get("httpResponseBody").getAsString();
        byte[] httpResponseBodyBytes = Base64.getDecoder().decode(base64HttpResponseBody);
        String httpResponseBody = new String(httpResponseBodyBytes, StandardCharsets.UTF_8);
        JsonObject data = JsonParser.parseString(httpResponseBody).getAsJsonObject();
        JsonObject headers = data.get("headers").getAsJsonObject();
      }
    }
  }

  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    String encodedAuth = Base64.getEncoder().encodeToString(auth.getBytes());
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')

axios.post(
  'https://api.zyte.com/v1/extract',
  {
    url: 'https://httpbin.org/anything',
    httpResponseBody: true,
    customHttpRequestHeaders: [
      {
        name: 'Accept-Language',
        value: 'fa'
      }
    ]
  },
  {
    auth: { username: 'YOUR_API_KEY' }
  }
).then((response) => {
  const httpResponseBody = Buffer.from(
    response.data.httpResponseBody,
    'base64'
  )
  const headers = JSON.parse(httpResponseBody).headers
})
<?php

$client = new GuzzleHttp\Client();
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', [
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => [
        'url' => 'https://httpbin.org/anything',
        'httpResponseBody' => true,
        'customHttpRequestHeaders' => [
            [
                'name' => 'Accept-Language',
                'value' => 'fa',
            ],
        ],
    ],
]);
$api = json_decode($response->getBody());
$http_response_body = base64_decode($api->httpResponseBody);
$data = json_decode($http_response_body);
$headers = $data->headers;

With the proxy mode, the request headers from your requests are used automatically.

curl \
    --proxy api.zyte.com:8011 \
    --proxy-user YOUR_API_KEY: \
    --compressed \
    -H "Accept-Language: fa" \
    https://httpbin.org/anything \
    | jq .headers
import json
from base64 import b64decode

import requests

api_response = requests.post(
    "https://api.zyte.com/v1/extract",
    auth=("YOUR_API_KEY", ""),
    json={
        "url": "https://httpbin.org/anything",
        "httpResponseBody": True,
        "customHttpRequestHeaders": [
            {
                "name": "Accept-Language",
                "value": "fa",
            },
        ],
    },
)
http_response_body = b64decode(api_response.json()["httpResponseBody"])
headers = json.loads(http_response_body)["headers"]
import asyncio
import json
from base64 import b64decode

from zyte_api import AsyncZyteAPI


async def main():
    client = AsyncZyteAPI()
    api_response = await client.get(
        {
            "url": "https://httpbin.org/anything",
            "httpResponseBody": True,
            "customHttpRequestHeaders": [
                {
                    "name": "Accept-Language",
                    "value": "fa",
                },
            ],
        }
    )
    http_response_body: bytes = b64decode(api_response["httpResponseBody"])
    headers = json.loads(http_response_body)["headers"]
    print(json.dumps(headers, indent=2))


asyncio.run(main())
import json

from scrapy import Request, Spider


class HTTPBinOrgSpider(Spider):
    name = "httpbin_org"

    def start_requests(self):
        yield Request(
            "https://httpbin.org/anything",
            headers={"Accept-Language": "fa"},
        )

    def parse(self, response):
        headers = json.loads(response.text)["headers"]

Output (first 5 lines):

{
  "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
  "Accept-Encoding": "gzip, deflate, br",
  "Accept-Language": "fa",
  "Host": "httpbin.org",

Redirection#

HTTP requests follow HTTP redirection by default. Set followRedirect to False to change that.

Device emulation#

In HTTP requests, use device to set a type of device emulation, either desktop (default) or mobile, to use for your request.

This option exists because some websites return different content depending on the type of device used to access them.

Note

In a request where you set device to mobile, you cannot use sessionContextParameters.actions.

Example

Note

Install and configure code example requirements and the Zyte CA certificate to run the example below.

using System;
using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;

HttpClientHandler handler = new HttpClientHandler()
{
    AutomaticDecompression = DecompressionMethods.All
};
HttpClient client = new HttpClient(handler);

var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);

client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

var input = new Dictionary<string, object>(){
    {"url", "https://httpbin.org/user-agent"},
    {"httpResponseBody", true},
    {"device", "mobile"}
};
var inputJson = JsonSerializer.Serialize(input);
var content = new StringContent(inputJson, Encoding.UTF8, "application/json");

HttpResponseMessage response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
var body = await response.Content.ReadAsByteArrayAsync();

var data = JsonDocument.Parse(body);
var base64HttpResponseBody = data.RootElement.GetProperty("httpResponseBody").ToString();
var httpResponseBody = System.Convert.FromBase64String(base64HttpResponseBody);

var responseData = JsonDocument.Parse(httpResponseBody);
var headerEnumerator = responseData.RootElement.EnumerateObject();
while (headerEnumerator.MoveNext())
{
    if (headerEnumerator.Current.Name.ToString() == "user-agent")
    {
        Console.WriteLine(headerEnumerator.Current.Value.ToString());
    }
}
input.jsonl#
{"url": "https://httpbin.org/user-agent", "httpResponseBody": true, "device": "mobile"}
zyte-api input.jsonl \
    | jq --raw-output .httpResponseBody \
    | base64 --decode \
    | jq --raw-output '.["user-agent"]'
input.json#
{
    "url": "https://httpbin.org/user-agent",
    "httpResponseBody": true,
    "device": "mobile"
}
curl \
    --user YOUR_API_KEY: \
    --header 'Content-Type: application/json' \
    --data @input.json \
    --compressed \
    https://api.zyte.com/v1/extract \
    | jq --raw-output .httpResponseBody \
    | base64 --decode \
    | jq --raw-output '.["user-agent"]'
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;

class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> parameters =
        ImmutableMap.of(
            "url", "https://httpbin.org/user-agent", "httpResponseBody", true, "device", "mobile");
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    request.setEntity(new StringEntity(requestBody));

    try (CloseableHttpClient client = HttpClients.createDefault()) {
      try (CloseableHttpResponse response = client.execute(request)) {
        HttpEntity entity = response.getEntity();
        String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
        JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
        String base64HttpResponseBody = jsonObject.get("httpResponseBody").getAsString();
        byte[] httpResponseBodyBytes = Base64.getDecoder().decode(base64HttpResponseBody);
        String httpResponseBody = new String(httpResponseBodyBytes, StandardCharsets.UTF_8);
        JsonObject data = JsonParser.parseString(httpResponseBody).getAsJsonObject();
        String userAgent = data.get("user-agent").getAsString();
        System.out.println(userAgent);
      }
    }
  }

  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    String encodedAuth = Base64.getEncoder().encodeToString(auth.getBytes());
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')

axios.post(
  'https://api.zyte.com/v1/extract',
  {
    url: 'https://httpbin.org/user-agent',
    httpResponseBody: true,
    device: 'mobile'
  },
  {
    auth: { username: 'YOUR_API_KEY' }
  }
).then((response) => {
  const httpResponseBody = Buffer.from(
    response.data.httpResponseBody,
    'base64'
  )
  console.log(JSON.parse(httpResponseBody)['user-agent'])
})
<?php

$client = new GuzzleHttp\Client();
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', [
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => [
        'url' => 'https://httpbin.org/user-agent',
        'httpResponseBody' => true,
        'device' => 'mobile',
    ],
]);
$api = json_decode($response->getBody());
$http_response_body = base64_decode($api->httpResponseBody);
$data = json_decode($http_response_body);
echo $data->{'user-agent'}."\n";

With the proxy mode, use the Zyte-Device header.

curl \
    --proxy api.zyte.com:8011 \
    --proxy-user YOUR_API_KEY: \
    --compressed \
    -H "Zyte-Device: mobile" \
    https://httpbin.org/user-agent \
    | jq --raw-output '.["user-agent"]'
import json
from base64 import b64decode

import requests

api_response = requests.post(
    "https://api.zyte.com/v1/extract",
    auth=("YOUR_API_KEY", ""),
    json={
        "url": "https://httpbin.org/user-agent",
        "httpResponseBody": True,
        "device": "mobile",
    },
)
http_response_body = b64decode(api_response.json()["httpResponseBody"])
user_agent = json.loads(http_response_body)["user-agent"]
print(user_agent)
import asyncio
import json
from base64 import b64decode

from zyte_api import AsyncZyteAPI


async def main():
    client = AsyncZyteAPI()
    api_response = await client.get(
        {
            "url": "https://httpbin.org/user-agent",
            "httpResponseBody": True,
            "device": "mobile",
        }
    )
    http_response_body: bytes = b64decode(api_response["httpResponseBody"])
    user_agent = json.loads(http_response_body)["user-agent"]
    print(user_agent)


asyncio.run(main())
import json

from scrapy import Request, Spider


class HTTPBinOrgSpider(Spider):
    name = "httpbin_org"

    def start_requests(self):
        yield Request(
            "https://httpbin.org/user-agent",
            meta={
                "zyte_api_automap": {
                    "device": "mobile",
                }
            },
        )

    def parse(self, response):
        user_agent = json.loads(response.text)["user-agent"]
        print(user_agent)

Example output (may vary):

Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Mobile Safari/537.36

Decoding HTML#

HTML extracted as a response body needs to be decoded.

HTML content can be encoded with one of many character encodings, and you must determine the character encoding used so that you can decode that HTML content accordingly.

The best way to determine the encoding of HTML content is to follow the encoding sniffing algorithm defined in the HTML standard.

In addition to the HTML content, the HTML encoding sniffing algorithm takes into account any character encoding provided in the optional charset parameter of media types declared in the Content-Type response header, so make sure you get the response headers in addition to the response body if you are following the HTML encoding sniffing algorithm.

Example

Note

Install and configure code example requirements and the Zyte CA certificate to run the example below.

Use file to find the media type of a previously-downloaded response based solely on its body (i.e. not following the HTML encoding sniffing algorithm).

file --mime-encoding output.html

Use content-type-parser, html-encoding-sniffer and whatwg-encoding:

const contentTypeParser = require('content-type-parser')
const htmlEncodingSniffer = require('html-encoding-sniffer')
const whatwgEncoding = require('whatwg-encoding')

// …

const httpResponseHeaders = response.data.httpResponseHeaders
let contentTypeCharset
httpResponseHeaders.forEach(function (item) {
  if (item.name.toLowerCase() === 'content-type') {
    contentTypeCharset = contentTypeParser(item.value).get('charset')
  }
})
const httpResponseBody = Buffer.from(response.data.httpResponseBody, 'base64')
const encoding = htmlEncodingSniffer(httpResponseBody, {
  transportLayerEncodingLabel: contentTypeCharset
})
const html = whatwgEncoding.decode(httpResponseBody, encoding)

web-poet provides a response wrapper that automatically decodes the response body following an encoding sniffing algorithm similar to the one defined in the HTML standard.

Provided that you have extracted a response with both body and headers, and you have Base64-decoded the response body, you can decode the HTML bytes as follows:

from web_poet import HttpResponse

# …

headers = tuple(
    (item['name'], item['value'])
    for item in http_response_headers
)
response = HttpResponse(
    url='https://example.com',
    body=http_response_body,
    status=200,
    headers=headers,
)
html = response.text

In transparent mode, regular Scrapy requests targeting HTML resources decode them by default. See Zyte API HTTP requests.

HTML and browser HTML#

HTML found in httpResponseBody is usually different from HTML found in browserHtml (browser HTML):

  • httpResponseBody does not reflect changes that a webpage makes at run time using JavaScript, such as loading content from additional URLs, or moving or reformatting content within the webpage.

  • browserHtml includes a normalization of the HTML from the underlying HTTP response, which web browsers perform according to the HTML5 specification. So the content of HTML and browser HTML could be different even when there is no JavaScript involved.

    Parsing HTML from httpResponseBody with libraries that do not implement HTML5 parsing, such as lxml.html (used by Scrapy by default), results in a different tree structure.

    With an HTML5-compatible parser the resulting tree structure would be the same, provided JavaScript does not cause any other difference.

Because of these differences, switching between these HTML inputs can break your existing parsing code and require changes, such as updating XPath or CSS selectors.