Zyte API HTTP requests
To send HTTP requests through Zyte API, set the httpResponseBody request field to true, and read the Base64-encoded response body from the httpResponseBody response field.
Example
Note
Install and configure code example requirements to run the example below.
using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
// Enable automatic decompression of compressed API responses.
var handler = new HttpClientHandler
{
    AutomaticDecompression = DecompressionMethods.All
};
var client = new HttpClient(handler);

// HTTP Basic auth: the API key is the user name and the password is empty.
var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);
client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

// Ask for the raw HTTP response body of the target page.
var input = new Dictionary<string, object>
{
    ["url"] = "https://toscrape.com",
    ["httpResponseBody"] = true
};
var inputJson = JsonSerializer.Serialize(input);
var content = new StringContent(inputJson, Encoding.UTF8, "application/json");

var response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
var body = await response.Content.ReadAsByteArrayAsync();

// httpResponseBody is Base64-encoded inside the JSON response.
var data = JsonDocument.Parse(body);
var base64HttpResponseBody = data.RootElement.GetProperty("httpResponseBody").ToString();
var httpResponseBody = System.Convert.FromBase64String(base64HttpResponseBody);
{"url": "https://toscrape.com", "httpResponseBody": true}
zyte-api input.jsonl \
| jq --raw-output .httpResponseBody \
| base64 --decode \
> output.html
{
"url": "https://toscrape.com",
"httpResponseBody": true
}
curl \
--user YOUR_API_KEY: \
--header 'Content-Type: application/json' \
--data @input.json \
--compressed \
https://api.zyte.com/v1/extract \
| jq --raw-output .httpResponseBody \
| base64 --decode \
> output.html
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;
/** Fetches a page through Zyte API and decodes the Base64-encoded HTTP response body. */
class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  /**
   * Posts the extraction request and decodes the {@code httpResponseBody} field of the JSON
   * response.
   */
  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> parameters =
        ImmutableMap.of("url", "https://toscrape.com", "httpResponseBody", true);
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    // Pass the content type so the payload is encoded as UTF-8, matching the Content-Type
    // header; the one-argument constructor would encode it as ISO-8859-1 text.
    request.setEntity(new StringEntity(requestBody, ContentType.APPLICATION_JSON));

    try (CloseableHttpClient client = HttpClients.createDefault();
        CloseableHttpResponse response = client.execute(request)) {
      HttpEntity entity = response.getEntity();
      String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
      JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
      // httpResponseBody is Base64-encoded inside the JSON response.
      String base64HttpResponseBody = jsonObject.get("httpResponseBody").getAsString();
      byte[] httpResponseBodyBytes = Base64.getDecoder().decode(base64HttpResponseBody);
      String httpResponseBody = new String(httpResponseBodyBytes, StandardCharsets.UTF_8);
    }
  }

  /** Builds the HTTP Basic {@code Authorization} value: API key as user name, empty password. */
  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    // Use an explicit charset so the result does not depend on the platform default.
    String encodedAuth =
        Base64.getEncoder().encodeToString(auth.getBytes(StandardCharsets.UTF_8));
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')
// Ask Zyte API for the raw HTTP response body of the target page.
const requestData = {
  url: 'https://toscrape.com',
  httpResponseBody: true
}
// The API key is sent as the Basic-auth user name.
const requestConfig = {
  auth: { username: 'YOUR_API_KEY' }
}

axios
  .post('https://api.zyte.com/v1/extract', requestData, requestConfig)
  .then((apiResponse) => {
    // httpResponseBody is Base64-encoded; decode it into a Buffer.
    const httpResponseBody = Buffer.from(
      apiResponse.data.httpResponseBody,
      'base64'
    )
  })
<?php
// Ask Zyte API for the raw HTTP response body of the target page.
$client = new GuzzleHttp\Client();
$options = [
    // HTTP Basic auth: the API key is the user name and the password is empty.
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => [
        'url' => 'https://toscrape.com',
        'httpResponseBody' => true,
    ],
];
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', $options);

// httpResponseBody is Base64-encoded inside the JSON response.
$data = json_decode($response->getBody());
$http_response_body = base64_decode($data->httpResponseBody);
from base64 import b64decode
import requests
# Ask Zyte API for the raw HTTP response body of the target page.
payload = {
    "url": "https://toscrape.com",
    "httpResponseBody": True,
}
api_response = requests.post(
    "https://api.zyte.com/v1/extract",
    auth=("YOUR_API_KEY", ""),
    json=payload,
)

# httpResponseBody is Base64-encoded inside the JSON response.
encoded_body = api_response.json()["httpResponseBody"]
http_response_body: bytes = b64decode(encoded_body)
import asyncio
from base64 import b64decode
from zyte_api.aio.client import AsyncClient
async def main():
    """Fetch https://toscrape.com through Zyte API and decode the response body."""
    client = AsyncClient()
    api_response = await client.request_raw(
        {
            "url": "https://toscrape.com",
            "httpResponseBody": True,
        }
    )
    # httpResponseBody is Base64-encoded in the API response.
    http_response_body: bytes = b64decode(api_response["httpResponseBody"])


asyncio.run(main())
In transparent mode, when you target a text resource (e.g. HTML, JSON), regular Scrapy requests work out of the box:
from scrapy import Spider
class ToScrapeSpider(Spider):
    """Spider that fetches https://toscrape.com with a regular Scrapy request."""

    name = "toscrape_com"
    start_urls = ["https://toscrape.com"]

    def parse(self, response):
        """Read the response as text."""
        http_response_text: str = response.text
While regular Scrapy requests also work for binary responses at the moment, they may stop working in future versions of scrapy-zyte-api, so passing httpResponseBody is recommended when targeting binary resources:
from scrapy import Request, Spider
class ToScrapeSpider(Spider):
    """Spider that explicitly sets ``httpResponseBody`` on its start request."""

    name = "toscrape_com"

    def start_requests(self):
        """Yield the start request with ``httpResponseBody`` set in ``zyte_api_automap``."""
        yield Request(
            "https://toscrape.com",
            meta={
                "zyte_api_automap": {
                    "httpResponseBody": True,
                },
            },
        )

    def parse(self, response):
        """Read the response as raw bytes."""
        http_response_body: bytes = response.body
Output (first 5 lines):
<!DOCTYPE html>
<html lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>Scraping Sandbox</title>
For HTTP requests, Zyte API also supports:
Tip
HTTP responses do not reflect HTML content rendered by a web browser that executes JavaScript code. To get browser HTML, use a browser request. See also HTML and browser HTML.
Request method
HTTP requests use the GET HTTP method by default. Use the httpRequestMethod field to set a different HTTP method.
Example
Note
Install and configure code example requirements to run the example below.
using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
// Enable automatic decompression of compressed API responses.
var handler = new HttpClientHandler
{
    AutomaticDecompression = DecompressionMethods.All
};
var client = new HttpClient(handler);

// HTTP Basic auth: the API key is the user name and the password is empty.
var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);
client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

// Ask Zyte API to send a POST request to the target URL.
var input = new Dictionary<string, object>
{
    ["url"] = "https://httpbin.org/anything",
    ["httpResponseBody"] = true,
    ["httpRequestMethod"] = "POST"
};
var inputJson = JsonSerializer.Serialize(input);
var content = new StringContent(inputJson, Encoding.UTF8, "application/json");

var response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
var body = await response.Content.ReadAsByteArrayAsync();

// httpResponseBody is Base64-encoded inside the JSON response.
var data = JsonDocument.Parse(body);
var base64HttpResponseBody = data.RootElement.GetProperty("httpResponseBody").ToString();
var httpResponseBody = System.Convert.FromBase64String(base64HttpResponseBody);

// The decoded body is itself JSON; read its "method" field.
var responseData = JsonDocument.Parse(httpResponseBody);
var method = responseData.RootElement.GetProperty("method").ToString();
{"url": "https://httpbin.org/anything", "httpResponseBody": true, "httpRequestMethod": "POST"}
zyte-api input.jsonl \
| jq --raw-output .httpResponseBody \
| base64 --decode \
| jq .method
{
"url": "https://httpbin.org/anything",
"httpResponseBody": true,
"httpRequestMethod": "POST"
}
curl \
--user YOUR_API_KEY: \
--header 'Content-Type: application/json' \
--data @input.json \
--compressed \
https://api.zyte.com/v1/extract \
| jq --raw-output .httpResponseBody \
| base64 --decode \
| jq .method
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;
/** Sends a POST request through Zyte API and reads the method reported in the response. */
class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  /**
   * Posts the extraction request with {@code httpRequestMethod} set to {@code POST}, decodes the
   * {@code httpResponseBody} field and reads the {@code method} field of the decoded JSON.
   */
  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> parameters =
        ImmutableMap.of(
            "url",
            "https://httpbin.org/anything",
            "httpResponseBody",
            true,
            "httpRequestMethod",
            "POST");
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    // Pass the content type so the payload is encoded as UTF-8, matching the Content-Type
    // header; the one-argument constructor would encode it as ISO-8859-1 text.
    request.setEntity(new StringEntity(requestBody, ContentType.APPLICATION_JSON));

    try (CloseableHttpClient client = HttpClients.createDefault();
        CloseableHttpResponse response = client.execute(request)) {
      HttpEntity entity = response.getEntity();
      String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
      JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
      // httpResponseBody is Base64-encoded inside the JSON response.
      String base64HttpResponseBody = jsonObject.get("httpResponseBody").getAsString();
      byte[] httpResponseBodyBytes = Base64.getDecoder().decode(base64HttpResponseBody);
      String httpResponseBody = new String(httpResponseBodyBytes, StandardCharsets.UTF_8);
      // The decoded body is itself JSON.
      JsonObject data = JsonParser.parseString(httpResponseBody).getAsJsonObject();
      String method = data.get("method").getAsString();
    }
  }

  /** Builds the HTTP Basic {@code Authorization} value: API key as user name, empty password. */
  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    // Use an explicit charset so the result does not depend on the platform default.
    String encodedAuth =
        Base64.getEncoder().encodeToString(auth.getBytes(StandardCharsets.UTF_8));
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')
// Ask Zyte API to send a POST request to the target URL.
const requestData = {
  url: 'https://httpbin.org/anything',
  httpResponseBody: true,
  httpRequestMethod: 'POST'
}
// The API key is sent as the Basic-auth user name.
const requestConfig = {
  auth: { username: 'YOUR_API_KEY' }
}

axios
  .post('https://api.zyte.com/v1/extract', requestData, requestConfig)
  .then((apiResponse) => {
    // httpResponseBody is Base64-encoded; decode it into a Buffer.
    const httpResponseBody = Buffer.from(
      apiResponse.data.httpResponseBody,
      'base64'
    )
    // The decoded body is itself JSON; read its "method" field.
    const method = JSON.parse(httpResponseBody).method
  })
<?php
// Ask Zyte API to send a POST request to the target URL.
$client = new GuzzleHttp\Client();
$options = [
    // HTTP Basic auth: the API key is the user name and the password is empty.
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => [
        'url' => 'https://httpbin.org/anything',
        'httpResponseBody' => true,
        'httpRequestMethod' => 'POST',
    ],
];
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', $options);

// httpResponseBody is Base64-encoded inside the JSON response.
$data = json_decode($response->getBody());
$http_response_body = base64_decode($data->httpResponseBody);
// The decoded body is itself JSON; read its "method" field.
$method = json_decode($http_response_body)->method;
import json
from base64 import b64decode
import requests
# Ask Zyte API to send a POST request to the target URL.
payload = {
    "url": "https://httpbin.org/anything",
    "httpResponseBody": True,
    "httpRequestMethod": "POST",
}
api_response = requests.post(
    "https://api.zyte.com/v1/extract",
    auth=("YOUR_API_KEY", ""),
    json=payload,
)

# httpResponseBody is Base64-encoded inside the JSON response.
encoded_body = api_response.json()["httpResponseBody"]
http_response_body = b64decode(encoded_body)
# The decoded body is itself JSON; read its "method" field.
method = json.loads(http_response_body)["method"]
import asyncio
import json
from base64 import b64decode
from zyte_api.aio.client import AsyncClient
async def main():
    """Send a POST request through Zyte API and read the method from the response."""
    client = AsyncClient()
    api_response = await client.request_raw(
        {
            "url": "https://httpbin.org/anything",
            "httpResponseBody": True,
            "httpRequestMethod": "POST",
        }
    )
    # httpResponseBody is Base64-encoded in the API response.
    http_response_body: bytes = b64decode(api_response["httpResponseBody"])
    # The decoded body is itself JSON; read its "method" field.
    method = json.loads(http_response_body)["method"]


asyncio.run(main())
import json
from scrapy import Request, Spider
class HTTPBinOrgSpider(Spider):
    """Spider that sends a POST request to https://httpbin.org/anything."""

    name = "httpbin_org"

    def start_requests(self):
        """Yield a single POST request."""
        yield Request(
            "https://httpbin.org/anything",
            method="POST",
        )

    def parse(self, response):
        """Read the ``method`` field of the JSON response."""
        method = json.loads(response.text)["method"]
Output:
"POST"
Request body
To include a body in your request, use one of the following fields:
httpRequestText, for UTF-8-encoded text.
httpRequestBody, for anything else. It supports binary data as well, so the value must be Base64-encoded.
httpRequestText example
Note
Install and configure code example requirements to run the example below.
using System;
using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
// Enable automatic decompression of compressed API responses.
var handler = new HttpClientHandler
{
    AutomaticDecompression = DecompressionMethods.All
};
var client = new HttpClient(handler);

// HTTP Basic auth: the API key is the user name and the password is empty.
var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);
client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

// Send a POST request whose body is given as text in httpRequestText.
var input = new Dictionary<string, object>
{
    ["url"] = "https://httpbin.org/anything",
    ["httpResponseBody"] = true,
    ["httpRequestMethod"] = "POST",
    ["httpRequestText"] = "{\"foo\": \"bar\"}"
};
var inputJson = JsonSerializer.Serialize(input);
var content = new StringContent(inputJson, Encoding.UTF8, "application/json");

var response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
var body = await response.Content.ReadAsByteArrayAsync();

// httpResponseBody is Base64-encoded inside the JSON response.
var data = JsonDocument.Parse(body);
var base64HttpResponseBody = data.RootElement.GetProperty("httpResponseBody").ToString();
var httpResponseBody = System.Convert.FromBase64String(base64HttpResponseBody);

// The decoded body is itself JSON; print its "data" field.
var responseData = JsonDocument.Parse(httpResponseBody);
var requestBody = responseData.RootElement.GetProperty("data").ToString();
Console.WriteLine(requestBody);
{"url": "https://httpbin.org/anything", "httpResponseBody": true, "httpRequestMethod": "POST", "httpRequestText": "{\"foo\": \"bar\"}"}
zyte-api input.jsonl \
| jq --raw-output .httpResponseBody \
| base64 --decode \
| jq --raw-output .data
{
"url": "https://httpbin.org/anything",
"httpResponseBody": true,
"httpRequestMethod": "POST",
"httpRequestText": "{\"foo\": \"bar\"}"
}
curl \
--user YOUR_API_KEY: \
--header 'Content-Type: application/json' \
--data @input.json \
--compressed \
https://api.zyte.com/v1/extract \
| jq --raw-output .httpResponseBody \
| base64 --decode \
| jq --raw-output .data
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;
/** Sends a POST request with a text body through Zyte API and prints the returned data. */
class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  /**
   * Posts the extraction request with {@code httpRequestText} set, decodes the {@code
   * httpResponseBody} field and prints the {@code data} field of the decoded JSON.
   */
  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> parameters =
        ImmutableMap.of(
            "url",
            "https://httpbin.org/anything",
            "httpResponseBody",
            true,
            "httpRequestMethod",
            "POST",
            "httpRequestText",
            "{\"foo\": \"bar\"}");
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    // Pass the content type so the payload is encoded as UTF-8, matching the Content-Type
    // header; the one-argument constructor would encode it as ISO-8859-1 text.
    request.setEntity(new StringEntity(requestBody, ContentType.APPLICATION_JSON));

    try (CloseableHttpClient client = HttpClients.createDefault();
        CloseableHttpResponse response = client.execute(request)) {
      HttpEntity entity = response.getEntity();
      String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
      JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
      // httpResponseBody is Base64-encoded inside the JSON response.
      String base64HttpResponseBody = jsonObject.get("httpResponseBody").getAsString();
      byte[] httpResponseBodyBytes = Base64.getDecoder().decode(base64HttpResponseBody);
      String httpResponseBody = new String(httpResponseBodyBytes, StandardCharsets.UTF_8);
      // The decoded body is itself JSON.
      JsonObject data = JsonParser.parseString(httpResponseBody).getAsJsonObject();
      String body = data.get("data").getAsString();
      System.out.println(body);
    }
  }

  /** Builds the HTTP Basic {@code Authorization} value: API key as user name, empty password. */
  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    // Use an explicit charset so the result does not depend on the platform default.
    String encodedAuth =
        Base64.getEncoder().encodeToString(auth.getBytes(StandardCharsets.UTF_8));
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')
// Send a POST request whose body is given as text in httpRequestText.
const requestData = {
  url: 'https://httpbin.org/anything',
  httpResponseBody: true,
  httpRequestMethod: 'POST',
  httpRequestText: '{"foo": "bar"}'
}
// The API key is sent as the Basic-auth user name.
const requestConfig = {
  auth: { username: 'YOUR_API_KEY' }
}

axios
  .post('https://api.zyte.com/v1/extract', requestData, requestConfig)
  .then((apiResponse) => {
    // httpResponseBody is Base64-encoded; decode it into a Buffer.
    const httpResponseBody = Buffer.from(
      apiResponse.data.httpResponseBody,
      'base64'
    )
    // The decoded body is itself JSON; print its "data" field.
    const body = JSON.parse(httpResponseBody).data
    console.log(body)
  })
<?php
// Send a POST request whose body is given as text in httpRequestText.
$client = new GuzzleHttp\Client();
$options = [
    // HTTP Basic auth: the API key is the user name and the password is empty.
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => [
        'url' => 'https://httpbin.org/anything',
        'httpResponseBody' => true,
        'httpRequestMethod' => 'POST',
        'httpRequestText' => '{"foo": "bar"}',
    ],
];
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', $options);

// httpResponseBody is Base64-encoded inside the JSON response.
$data = json_decode($response->getBody());
$http_response_body = base64_decode($data->httpResponseBody);
// The decoded body is itself JSON; print its "data" field.
$body = json_decode($http_response_body)->data;
echo $body.PHP_EOL;
import json
from base64 import b64decode
import requests
# Send a POST request whose body is given as text in httpRequestText.
payload = {
    "url": "https://httpbin.org/anything",
    "httpResponseBody": True,
    "httpRequestMethod": "POST",
    "httpRequestText": '{"foo": "bar"}',
}
api_response = requests.post(
    "https://api.zyte.com/v1/extract",
    auth=("YOUR_API_KEY", ""),
    json=payload,
)

# httpResponseBody is Base64-encoded inside the JSON response.
encoded_body = api_response.json()["httpResponseBody"]
http_response_body = b64decode(encoded_body)
# The decoded body is itself JSON; print its "data" field.
body: str = json.loads(http_response_body)["data"]
print(body)
import asyncio
import json
from base64 import b64decode
from zyte_api.aio.client import AsyncClient
async def main():
    """Send a POST request with a text body through Zyte API and print the returned data."""
    client = AsyncClient()
    api_response = await client.request_raw(
        {
            "url": "https://httpbin.org/anything",
            "httpResponseBody": True,
            "httpRequestMethod": "POST",
            "httpRequestText": '{"foo": "bar"}',
        }
    )
    # httpResponseBody is Base64-encoded in the API response.
    http_response_body: bytes = b64decode(api_response["httpResponseBody"])
    # The decoded body is itself JSON; print its "data" field.
    body: str = json.loads(http_response_body)["data"]
    print(body)


asyncio.run(main())
import json
from scrapy import Request, Spider
class HTTPBinOrgSpider(Spider):
    """Spider that sends a POST request with a text body to https://httpbin.org/anything."""

    name = "httpbin_org"

    def start_requests(self):
        """Yield a single POST request carrying a JSON string as its body."""
        yield Request(
            "https://httpbin.org/anything",
            method="POST",
            body='{"foo": "bar"}',
        )

    def parse(self, response):
        """Print the ``data`` field of the JSON response."""
        body = json.loads(response.body)["data"]
        print(body)
Output:
{"foo": "bar"}
httpRequestBody example
Note
Install and configure code example requirements to run the example below.
using System;
using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
// Enable automatic decompression of compressed API responses.
var handler = new HttpClientHandler
{
    AutomaticDecompression = DecompressionMethods.All
};
var client = new HttpClient(handler);

// HTTP Basic auth: the API key is the user name and the password is empty.
var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);
client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

// Send a POST request whose body is given Base64-encoded in httpRequestBody.
var input = new Dictionary<string, object>
{
    ["url"] = "https://httpbin.org/anything",
    ["httpResponseBody"] = true,
    ["httpRequestMethod"] = "POST",
    ["httpRequestBody"] = "Zm9v"
};
var inputJson = JsonSerializer.Serialize(input);
var content = new StringContent(inputJson, Encoding.UTF8, "application/json");

var response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
var body = await response.Content.ReadAsByteArrayAsync();

// httpResponseBody is Base64-encoded inside the JSON response.
var data = JsonDocument.Parse(body);
var base64HttpResponseBody = data.RootElement.GetProperty("httpResponseBody").ToString();
var httpResponseBody = System.Convert.FromBase64String(base64HttpResponseBody);

// The decoded body is itself JSON; print its "data" field.
var responseData = JsonDocument.Parse(httpResponseBody);
var requestBody = responseData.RootElement.GetProperty("data").ToString();
Console.WriteLine(requestBody);
{"url": "https://httpbin.org/anything", "httpResponseBody": true, "httpRequestMethod": "POST", "httpRequestBody": "Zm9v"}
zyte-api input.jsonl \
| jq --raw-output .httpResponseBody \
| base64 --decode \
| jq --raw-output .data
{
"url": "https://httpbin.org/anything",
"httpResponseBody": true,
"httpRequestMethod": "POST",
"httpRequestBody": "Zm9v"
}
curl \
--user YOUR_API_KEY: \
--header 'Content-Type: application/json' \
--data @input.json \
--compressed \
https://api.zyte.com/v1/extract \
| jq --raw-output .httpResponseBody \
| base64 --decode \
| jq --raw-output .data
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;
/** Sends a POST request with a Base64-encoded body through Zyte API and prints the returned data. */
class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  /**
   * Posts the extraction request with {@code httpRequestBody} set, decodes the {@code
   * httpResponseBody} field and prints the {@code data} field of the decoded JSON.
   */
  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> parameters =
        ImmutableMap.of(
            "url",
            "https://httpbin.org/anything",
            "httpResponseBody",
            true,
            "httpRequestMethod",
            "POST",
            "httpRequestBody",
            "Zm9v");
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    // Pass the content type so the payload is encoded as UTF-8, matching the Content-Type
    // header; the one-argument constructor would encode it as ISO-8859-1 text.
    request.setEntity(new StringEntity(requestBody, ContentType.APPLICATION_JSON));

    try (CloseableHttpClient client = HttpClients.createDefault();
        CloseableHttpResponse response = client.execute(request)) {
      HttpEntity entity = response.getEntity();
      String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
      JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
      // httpResponseBody is Base64-encoded inside the JSON response.
      String base64HttpResponseBody = jsonObject.get("httpResponseBody").getAsString();
      byte[] httpResponseBodyBytes = Base64.getDecoder().decode(base64HttpResponseBody);
      String httpResponseBody = new String(httpResponseBodyBytes, StandardCharsets.UTF_8);
      // The decoded body is itself JSON.
      JsonObject data = JsonParser.parseString(httpResponseBody).getAsJsonObject();
      String body = data.get("data").getAsString();
      System.out.println(body);
    }
  }

  /** Builds the HTTP Basic {@code Authorization} value: API key as user name, empty password. */
  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    // Use an explicit charset so the result does not depend on the platform default.
    String encodedAuth =
        Base64.getEncoder().encodeToString(auth.getBytes(StandardCharsets.UTF_8));
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')
// Send a POST request whose body is given Base64-encoded in httpRequestBody.
const requestData = {
  url: 'https://httpbin.org/anything',
  httpResponseBody: true,
  httpRequestMethod: 'POST',
  httpRequestBody: 'Zm9v'
}
// The API key is sent as the Basic-auth user name.
const requestConfig = {
  auth: { username: 'YOUR_API_KEY' }
}

axios
  .post('https://api.zyte.com/v1/extract', requestData, requestConfig)
  .then((apiResponse) => {
    // httpResponseBody is Base64-encoded; decode it into a Buffer.
    const httpResponseBody = Buffer.from(
      apiResponse.data.httpResponseBody,
      'base64'
    )
    // The decoded body is itself JSON; print its "data" field.
    const body = JSON.parse(httpResponseBody).data
    console.log(body)
  })
<?php
// Send a POST request whose body is given Base64-encoded in httpRequestBody.
$client = new GuzzleHttp\Client();
$options = [
    // HTTP Basic auth: the API key is the user name and the password is empty.
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => [
        'url' => 'https://httpbin.org/anything',
        'httpResponseBody' => true,
        'httpRequestMethod' => 'POST',
        'httpRequestBody' => 'Zm9v',
    ],
];
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', $options);

// httpResponseBody is Base64-encoded inside the JSON response.
$data = json_decode($response->getBody());
$http_response_body = base64_decode($data->httpResponseBody);
// The decoded body is itself JSON; print its "data" field.
$body = json_decode($http_response_body)->data;
echo $body.PHP_EOL;
import json
from base64 import b64decode
import requests
# Send a POST request whose body is given Base64-encoded in httpRequestBody.
payload = {
    "url": "https://httpbin.org/anything",
    "httpResponseBody": True,
    "httpRequestMethod": "POST",
    "httpRequestBody": "Zm9v",
}
api_response = requests.post(
    "https://api.zyte.com/v1/extract",
    auth=("YOUR_API_KEY", ""),
    json=payload,
)

# httpResponseBody is Base64-encoded inside the JSON response.
encoded_body = api_response.json()["httpResponseBody"]
http_response_body = b64decode(encoded_body)
# The decoded body is itself JSON; print its "data" field.
body: str = json.loads(http_response_body)["data"]
print(body)
import asyncio
import json
from base64 import b64decode
from zyte_api.aio.client import AsyncClient
async def main():
    """Send a POST request with a Base64-encoded body through Zyte API and print the returned data."""
    client = AsyncClient()
    api_response = await client.request_raw(
        {
            "url": "https://httpbin.org/anything",
            "httpResponseBody": True,
            "httpRequestMethod": "POST",
            "httpRequestBody": "Zm9v",
        }
    )
    # httpResponseBody is Base64-encoded in the API response.
    http_response_body: bytes = b64decode(api_response["httpResponseBody"])
    # The decoded body is itself JSON; print its "data" field.
    body: str = json.loads(http_response_body)["data"]
    print(body)


asyncio.run(main())
import json
from scrapy import Request, Spider
class HTTPBinOrgSpider(Spider):
    """Spider that sends a POST request with a bytes body to https://httpbin.org/anything."""

    name = "httpbin_org"

    def start_requests(self):
        """Yield a single POST request carrying ``b"foo"`` as its body."""
        yield Request(
            "https://httpbin.org/anything",
            method="POST",
            body=b"foo",
        )

    def parse(self, response):
        """Print the ``data`` field of the JSON response."""
        body = json.loads(response.body)["data"]
        print(body)
Output:
foo
Request headers
In HTTP requests, use customHttpRequestHeaders to set request headers.
Example
Note
Install and configure code example requirements to run the example below.
using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
// Enable automatic decompression of compressed API responses.
var handler = new HttpClientHandler
{
    AutomaticDecompression = DecompressionMethods.All
};
var client = new HttpClient(handler);

// HTTP Basic auth: the API key is the user name and the password is empty.
var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);
client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

// Set a custom Accept-Language header on the request to the target URL.
var input = new Dictionary<string, object>
{
    ["url"] = "https://httpbin.org/anything",
    ["httpResponseBody"] = true,
    ["customHttpRequestHeaders"] = new List<Dictionary<string, object>>
    {
        new Dictionary<string, object>
        {
            ["name"] = "Accept-Language",
            ["value"] = "fa"
        }
    }
};
var inputJson = JsonSerializer.Serialize(input);
var content = new StringContent(inputJson, Encoding.UTF8, "application/json");

var response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
var body = await response.Content.ReadAsByteArrayAsync();

// httpResponseBody is Base64-encoded inside the JSON response.
var data = JsonDocument.Parse(body);
var base64HttpResponseBody = data.RootElement.GetProperty("httpResponseBody").ToString();
var httpResponseBody = System.Convert.FromBase64String(base64HttpResponseBody);

// The decoded body is itself JSON; copy its "headers" object into a dictionary.
var responseData = JsonDocument.Parse(httpResponseBody);
var headers = new Dictionary<string, string>();
foreach (var header in responseData.RootElement.GetProperty("headers").EnumerateObject())
{
    headers.Add(header.Name, header.Value.ToString());
}
{"url": "https://httpbin.org/anything", "httpResponseBody": true, "customHttpRequestHeaders": [{"name": "Accept-Language", "value": "fa"}]}
zyte-api input.jsonl \
| jq --raw-output .httpResponseBody \
| base64 --decode \
| jq .headers
{
"url": "https://httpbin.org/anything",
"httpResponseBody": true,
"customHttpRequestHeaders": [
{
"name": "Accept-Language",
"value": "fa"
}
]
}
curl \
--user YOUR_API_KEY: \
--header 'Content-Type: application/json' \
--data @input.json \
--compressed \
https://api.zyte.com/v1/extract \
| jq --raw-output .httpResponseBody \
| base64 --decode \
| jq .headers
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Collections;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;
/** Sends a request with a custom header through Zyte API and reads the returned headers. */
class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  /**
   * Posts the extraction request with one {@code customHttpRequestHeaders} entry, decodes the
   * {@code httpResponseBody} field and reads the {@code headers} object of the decoded JSON.
   */
  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> customHttpRequestHeader =
        ImmutableMap.of("name", "Accept-Language", "value", "fa");
    Map<String, Object> parameters =
        ImmutableMap.of(
            "url",
            "https://httpbin.org/anything",
            "httpResponseBody",
            true,
            "customHttpRequestHeaders",
            Collections.singletonList(customHttpRequestHeader));
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    // Pass the content type so the payload is encoded as UTF-8, matching the Content-Type
    // header; the one-argument constructor would encode it as ISO-8859-1 text.
    request.setEntity(new StringEntity(requestBody, ContentType.APPLICATION_JSON));

    try (CloseableHttpClient client = HttpClients.createDefault();
        CloseableHttpResponse response = client.execute(request)) {
      HttpEntity entity = response.getEntity();
      String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
      JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
      // httpResponseBody is Base64-encoded inside the JSON response.
      String base64HttpResponseBody = jsonObject.get("httpResponseBody").getAsString();
      byte[] httpResponseBodyBytes = Base64.getDecoder().decode(base64HttpResponseBody);
      String httpResponseBody = new String(httpResponseBodyBytes, StandardCharsets.UTF_8);
      // The decoded body is itself JSON.
      JsonObject data = JsonParser.parseString(httpResponseBody).getAsJsonObject();
      JsonObject headers = data.get("headers").getAsJsonObject();
    }
  }

  /** Builds the HTTP Basic {@code Authorization} value: API key as user name, empty password. */
  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    // Use an explicit charset so the result does not depend on the platform default.
    String encodedAuth =
        Base64.getEncoder().encodeToString(auth.getBytes(StandardCharsets.UTF_8));
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')
// Set a custom Accept-Language header on the request to the target URL.
const requestData = {
  url: 'https://httpbin.org/anything',
  httpResponseBody: true,
  customHttpRequestHeaders: [
    {
      name: 'Accept-Language',
      value: 'fa'
    }
  ]
}
// The API key is sent as the Basic-auth user name.
const requestConfig = {
  auth: { username: 'YOUR_API_KEY' }
}

axios
  .post('https://api.zyte.com/v1/extract', requestData, requestConfig)
  .then((apiResponse) => {
    // httpResponseBody is Base64-encoded; decode it into a Buffer.
    const httpResponseBody = Buffer.from(
      apiResponse.data.httpResponseBody,
      'base64'
    )
    // The decoded body is itself JSON; read its "headers" object.
    const headers = JSON.parse(httpResponseBody).headers
  })
<?php
// Zyte API parameters: fetch the target URL and send one custom request
// header (Accept-Language) along with it.
$payload = [
    'url' => 'https://httpbin.org/anything',
    'httpResponseBody' => true,
    'customHttpRequestHeaders' => [
        ['name' => 'Accept-Language', 'value' => 'fa'],
    ],
];

// The API key is the Basic-auth user name; the password is empty.
$options = [
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => $payload,
];

$client = new GuzzleHttp\Client();
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', $options);

// httpResponseBody is Base64-encoded; once decoded it is a JSON document
// whose "headers" member is read below.
$api = json_decode($response->getBody());
$http_response_body = base64_decode($api->httpResponseBody);
$data = json_decode($http_response_body);
$headers = $data->headers;
import json
from base64 import b64decode
import requests
# Zyte API parameters: fetch the target URL and send one custom request
# header (Accept-Language) along with it.
request_payload = {
    "url": "https://httpbin.org/anything",
    "httpResponseBody": True,
    "customHttpRequestHeaders": [{"name": "Accept-Language", "value": "fa"}],
}

# The API key is the Basic-auth user name; the password is empty.
api_response = requests.post(
    "https://api.zyte.com/v1/extract",
    auth=("YOUR_API_KEY", ""),
    json=request_payload,
)

# httpResponseBody is Base64-encoded; once decoded it is a JSON document
# whose "headers" member is read below.
http_response_body = b64decode(api_response.json()["httpResponseBody"])
headers = json.loads(http_response_body)["headers"]
import asyncio
import json
from base64 import b64decode
from zyte_api.aio.client import AsyncClient
async def main():
    """Request httpbin.org/anything through Zyte API with a custom
    Accept-Language header and read the "headers" member of the decoded body.
    """
    client = AsyncClient()
    query = {
        "url": "https://httpbin.org/anything",
        "httpResponseBody": True,
        "customHttpRequestHeaders": [
            {"name": "Accept-Language", "value": "fa"},
        ],
    }
    api_response = await client.request_raw(query)
    # httpResponseBody is Base64-encoded; decode it to raw bytes first.
    http_response_body: bytes = b64decode(api_response["httpResponseBody"])
    headers = json.loads(http_response_body)["headers"]


asyncio.run(main())
import json
from scrapy import Request, Spider
class HTTPBinOrgSpider(Spider):
    """Spider that requests httpbin.org/anything with a custom
    Accept-Language header and reads the "headers" member of the JSON reply.
    """

    name = "httpbin_org"

    def start_requests(self):
        # Custom headers are set on the regular Scrapy request.
        custom_headers = {"Accept-Language": "fa"}
        yield Request("https://httpbin.org/anything", headers=custom_headers)

    def parse(self, response):
        # The response text is parsed as JSON and its "headers" member is read.
        payload = json.loads(response.text)
        headers = payload["headers"]
Output (first 5 lines):
{
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "fa",
"Host": "httpbin.org",
Decoding HTML#
HTML extracted as a response body needs to be decoded.
HTML content can be encoded with one of many character encodings, and you must determine the character encoding used so that you can decode that HTML content accordingly.
The best way to determine the encoding of HTML content is to follow the encoding sniffing algorithm defined in the HTML standard.
In addition to the HTML content, the HTML encoding sniffing algorithm takes into account any character encoding provided in the optional charset parameter of the media type declared in the Content-Type response header, so make sure you get the response headers in addition to the response body if you are following the HTML encoding sniffing algorithm.
Example
Note
Install and configure code example requirements to run the example below.
Use file to find the character encoding of a previously-downloaded response based solely on its body (i.e. not following the HTML encoding sniffing algorithm).
file --mime-encoding output.html
Use content-type-parser, html-encoding-sniffer and whatwg-encoding:
const contentTypeParser = require('content-type-parser')
const htmlEncodingSniffer = require('html-encoding-sniffer')
const whatwgEncoding = require('whatwg-encoding')
// …
// Look for a charset parameter in the Content-Type response header. If the
// header appears more than once, the last occurrence wins.
const httpResponseHeaders = response.data.httpResponseHeaders
let contentTypeCharset
for (const { name, value } of httpResponseHeaders) {
  if (name.toLowerCase() !== 'content-type') {
    continue
  }
  contentTypeCharset = contentTypeParser(value).get('charset')
}

// The body is Base64-encoded in the API response; decode it to raw bytes.
const httpResponseBody = Buffer.from(response.data.httpResponseBody, 'base64')

// Sniff the encoding from the bytes, passing the header charset (possibly
// undefined) as the transport-layer label, then decode with the result.
const sniffOptions = { transportLayerEncodingLabel: contentTypeCharset }
const encoding = htmlEncodingSniffer(httpResponseBody, sniffOptions)
const html = whatwgEncoding.decode(httpResponseBody, encoding)
web-poet provides a response wrapper that automatically decodes the response body following an encoding sniffing algorithm similar to the one defined in the HTML standard.
Provided that you have extracted a response with both body and headers, and you have Base64-decoded the response body, you can decode the HTML bytes as follows:
from web_poet import HttpResponse
# …
# Reshape the header entries from name/value mappings into (name, value) pairs.
headers = tuple(
    (header['name'], header['value']) for header in http_response_headers
)
# Wrap the Base64-decoded body bytes and the headers in a web-poet response.
response = HttpResponse(
    url='https://example.com',
    body=http_response_body,
    status=200,
    headers=headers,
)
# Reading .text gives the body decoded to a string.
html = response.text
In transparent mode, responses to regular Scrapy requests that target HTML resources are decoded by default. See Zyte API HTTP requests.
HTML and browser HTML#
HTML found in httpResponseBody is usually different from HTML found in browserHtml (browser HTML):
httpResponseBody does not reflect changes that a webpage makes at run time using JavaScript, such as loading content from additional URLs, or moving or reformatting content within the webpage.
browserHtml includes a normalization of the HTML from the underlying HTTP response, which web browsers perform according to the HTML5 specification. So the content of HTML and browser HTML could be different even when there is no JavaScript involved.
Parsing HTML from httpResponseBody with libraries that do not implement HTML5 parsing, such as lxml.html (used by Scrapy by default), results in a different tree structure.
With an HTML5-compatible parser the resulting tree structure would be the same, provided JavaScript does not cause any other difference.
Because of these differences, switching between these HTML inputs can break your existing parsing code and require changes, such as updating XPath or CSS selectors.