Zyte API shared features#
Learn here about Zyte API features that you can use with both HTTP requests and browser automation: geolocation, cookies, redirection, response headers, and metadata.
Geolocation#
Set the geolocation
key in your API request body to a supported ISO 3166-1
alpha-2 country code to
channel your request through an IP address associated with the corresponding
country.
Example
Note
Install and configure code example requirements to run the example below.
using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
// Send a Zyte API request with geolocation set to "AU" and read the country code
// that ip-api.com reports for the request.

// Let the handler transparently decompress compressed API responses.
HttpClientHandler handler = new HttpClientHandler()
{
    AutomaticDecompression = DecompressionMethods.All
};
// Disposing the client also disposes the handler it was given.
using HttpClient client = new HttpClient(handler);

// HTTP Basic auth: the API key is the user name and the password is empty.
var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);
client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

var input = new Dictionary<string, object>(){
    {"url", "http://ip-api.com/json"},
    {"httpResponseBody", true},
    {"geolocation", "AU"}
};
var inputJson = JsonSerializer.Serialize(input);
using var content = new StringContent(inputJson, Encoding.UTF8, "application/json");

using HttpResponseMessage response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
// Fail here with the HTTP status rather than later with a missing-property error.
response.EnsureSuccessStatusCode();
var body = await response.Content.ReadAsByteArrayAsync();

// JsonDocument is IDisposable; dispose it so its buffers are released.
using var data = JsonDocument.Parse(body);
// httpResponseBody is Base64-encoded; decode it before parsing it as JSON.
var base64HttpResponseBody = data.RootElement.GetProperty("httpResponseBody").ToString();
var httpResponseBody = System.Convert.FromBase64String(base64HttpResponseBody);
using var responseData = JsonDocument.Parse(httpResponseBody);
var countryCode = responseData.RootElement.GetProperty("countryCode").ToString();
{"url": "http://ip-api.com/json", "httpResponseBody": true, "geolocation": "AU"}
# POST input.json to Zyte API, then extract the Base64-encoded response body,
# decode it, and print the country code it contains.
curl \
    -u YOUR_API_KEY: \
    -H 'Content-Type: application/json' \
    -d @input.json \
    --compressed \
    https://api.zyte.com/v1/extract |
    jq -r .httpResponseBody |
    base64 --decode |
    jq .countryCode
{"url": "http://ip-api.com/json", "httpResponseBody": true, "geolocation": "AU"}
# Run the request from input.jsonl (stderr discarded), then extract the
# Base64-encoded response body, decode it, and print the country code.
zyte-api input.jsonl 2>/dev/null |
    jq -r .httpResponseBody |
    base64 --decode |
    jq .countryCode
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;
/**
 * Sends a Zyte API request with geolocation set to "AU" and reads the country code that
 * ip-api.com reports for the request.
 */
class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  /**
   * Runs the example request.
   *
   * @param args unused
   */
  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> parameters =
        ImmutableMap.of(
            "url", "http://ip-api.com/json", "httpResponseBody", true, "geolocation", "AU");
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    // Give the entity an explicit JSON content type so the body is encoded as UTF-8;
    // the single-argument StringEntity constructor falls back to ISO-8859-1 text.
    request.setEntity(new StringEntity(requestBody, ContentType.APPLICATION_JSON));

    try (CloseableHttpClient client = HttpClients.createDefault();
        CloseableHttpResponse response = client.execute(request)) {
      HttpEntity entity = response.getEntity();
      String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
      JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
      // httpResponseBody is Base64-encoded; decode it before parsing it as JSON.
      String base64HttpResponseBody = jsonObject.get("httpResponseBody").getAsString();
      byte[] httpResponseBodyBytes = Base64.getDecoder().decode(base64HttpResponseBody);
      String httpResponseBody = new String(httpResponseBodyBytes, StandardCharsets.UTF_8);
      JsonObject data = JsonParser.parseString(httpResponseBody).getAsJsonObject();
      String countryCode = data.get("countryCode").getAsString();
    }
  }

  /**
   * Builds the HTTP Basic Authorization header value, using the API key as the user name and
   * an empty password.
   *
   * @return the header value, starting with "Basic "
   */
  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    // Use an explicit charset instead of the platform default.
    String encodedAuth =
        Base64.getEncoder().encodeToString(auth.getBytes(StandardCharsets.UTF_8));
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')
// Request http://ip-api.com/json through Zyte API with geolocation set to 'AU'
// and read the country code reported in the response.
const requestParameters = {
  url: 'http://ip-api.com/json',
  httpResponseBody: true,
  geolocation: 'AU'
}
const requestConfig = { auth: { username: 'YOUR_API_KEY' } }

axios
  .post('https://api.zyte.com/v1/extract', requestParameters, requestConfig)
  .then(({ data: apiResponse }) => {
    // httpResponseBody is Base64-encoded; decode it before parsing it as JSON.
    const bodyBuffer = Buffer.from(apiResponse.httpResponseBody, 'base64')
    const { countryCode } = JSON.parse(bodyBuffer)
  })
<?php
// Request http://ip-api.com/json through Zyte API with geolocation set to 'AU'
// and read the country code reported in the response.
$client = new GuzzleHttp\Client();
$zyte_api_parameters = [
    'url' => 'http://ip-api.com/json',
    'httpResponseBody' => true,
    'geolocation' => 'AU',
];
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', [
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => $zyte_api_parameters,
]);
$api = json_decode($response->getBody());
// httpResponseBody is Base64-encoded; decode it before parsing it as JSON.
$decoded_body = base64_decode($api->httpResponseBody);
$country_code = json_decode($decoded_body)->countryCode;
import json
from base64 import b64decode
import requests
# Request http://ip-api.com/json through Zyte API with geolocation set to 'AU'
# and read the country code reported in the response.
request_parameters = {
    'url': 'http://ip-api.com/json',
    'httpResponseBody': True,
    'geolocation': 'AU',
}
api_response = requests.post(
    'https://api.zyte.com/v1/extract',
    auth=('YOUR_API_KEY', ''),
    json=request_parameters,
)
# httpResponseBody is Base64-encoded; decode it before parsing it as JSON.
encoded_body = api_response.json()['httpResponseBody']
http_response_body: bytes = b64decode(encoded_body)
response_data = json.loads(http_response_body)
country_code = response_data['countryCode']
import asyncio
import json
from base64 import b64decode
from zyte_api.aio.client import AsyncClient
async def main():
    """Request ip-api.com with geolocation 'AU' and read the reported country code."""
    client = AsyncClient()
    query = {
        'url': 'http://ip-api.com/json',
        'httpResponseBody': True,
        'geolocation': 'AU',
    }
    api_response = await client.request_raw(query)
    # httpResponseBody is Base64-encoded; decode it before parsing it as JSON.
    encoded_body = api_response['httpResponseBody']
    http_response_body: bytes = b64decode(encoded_body)
    response_data = json.loads(http_response_body)
    country_code = response_data['countryCode']


asyncio.run(main())
import json
from scrapy import Request, Spider
class IPAPIComSpider(Spider):
    """Request ip-api.com with Zyte API geolocation set to "AU"."""

    name = "ip_api_com"

    def start_requests(self):
        """Yield the single request, passing geolocation via zyte_api_automap."""
        zyte_api_parameters = {"geolocation": "AU"}
        yield Request(
            "http://ip-api.com/json",
            meta={"zyte_api_automap": zyte_api_parameters},
        )

    def parse(self, response):
        """Parse the JSON response body and read its country code."""
        payload = json.loads(response.body)
        country_code = payload["countryCode"]
Look up the geolocation
key in the specification
for the list of supported countries.
When the geolocation
key is not specified, Zyte API aims to channel your
request through a country that ensures a good response from the target website,
meaning that the chosen country:
Does not cause unexpected locale changes in the response data, such as the wrong language, currency, date format, time zone, etc.
Does not cause your request to be banned.
Zyte API can use countries of origin beyond those supported by the
geolocation
key. For example, if you access a Turkish website, Zyte API may
access the website from Türkiye as long as you do not specify otherwise through
the geolocation
key, even though geolocation
does not support TR
as
a value at the moment.
Cookies#
Warning
Cookie support is currently experimental. Breaking API changes may be introduced at any moment.
Set the following keys within the experimental
key in your API request body
to handle cookies:
To include cookies in a request, set the requestCookies key to an array of objects representing those cookies.
Each cookie object requires name, value, and domain keys. For a complete list of supported keys, see the reference.
Set the responseCookies key to true to get website response cookies included in the matching key of your Zyte API response.
responseCookies also works with browser features. For example, you can send a request to extract browser HTML and response cookies from a URL, and then use those cookies with the requestCookies key of requests that extract a response body.
Example
Note
Install and configure code example requirements to run the example below.
The following code example sends a cookie to httpbin.org and prints the cookies that httpbin.org reports to have received:
using System;
using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
// Send a cookie to httpbin.org through Zyte API and print the cookies that
// httpbin.org reports to have received.

// Let the handler transparently decompress compressed API responses.
HttpClientHandler handler = new HttpClientHandler()
{
    AutomaticDecompression = DecompressionMethods.All
};
// Disposing the client also disposes the handler it was given.
using HttpClient client = new HttpClient(handler);

// HTTP Basic auth: the API key is the user name and the password is empty.
var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);
client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

// requestCookies lives under the "experimental" key of the request body.
var input = new Dictionary<string, object>(){
    {"url", "https://httpbin.org/cookies"},
    {"httpResponseBody", true},
    {
        "experimental",
        new Dictionary<string, object>()
        {
            {
                "requestCookies",
                new List<Dictionary<string, string>>()
                {
                    new Dictionary<string, string>()
                    {
                        {"name", "foo"},
                        {"value", "bar"},
                        {"domain", "httpbin.org"}
                    }
                }
            }
        }
    }
};
var inputJson = JsonSerializer.Serialize(input);
using var content = new StringContent(inputJson, Encoding.UTF8, "application/json");

using HttpResponseMessage response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
// Fail here with the HTTP status rather than later with a missing-property error.
response.EnsureSuccessStatusCode();
var body = await response.Content.ReadAsByteArrayAsync();

// JsonDocument is IDisposable; dispose it so its buffers are released.
using var data = JsonDocument.Parse(body);
// httpResponseBody is Base64-encoded; decode it to text before printing.
var base64HttpResponseBody = data.RootElement.GetProperty("httpResponseBody").ToString();
var httpResponseBody = System.Convert.FromBase64String(base64HttpResponseBody);
var result = System.Text.Encoding.UTF8.GetString(httpResponseBody);
Console.WriteLine(result);
{
"url": "https://httpbin.org/cookies",
"httpResponseBody": true,
"experimental": {
"requestCookies": [
{
"name": "foo",
"value": "bar",
"domain": "httpbin.org"
}
]
}
}
# POST input.json to Zyte API, then extract and decode the Base64-encoded
# response body.
curl \
    -u YOUR_API_KEY: \
    -H 'Content-Type: application/json' \
    -d @input.json \
    --compressed \
    https://api.zyte.com/v1/extract |
    jq -r .httpResponseBody |
    base64 --decode
{"url":"https://httpbin.org/cookies","httpResponseBody":true,"experimental":{"requestCookies":[{"name":"foo","value":"bar","domain":"httpbin.org"}]}}
# Run the request from input.jsonl, then extract and decode the Base64-encoded
# response body.
zyte-api input.jsonl |
    jq -r .httpResponseBody |
    base64 --decode
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Collections;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;
/**
 * Sends a cookie to httpbin.org through Zyte API and prints the cookies that httpbin.org
 * reports to have received.
 */
class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  /**
   * Runs the example request and prints the decoded response body.
   *
   * @param args unused
   */
  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, String> cookies =
        ImmutableMap.of("name", "foo", "value", "bar", "domain", "httpbin.org");
    // requestCookies lives under the "experimental" key of the request body.
    Map<String, Object> experimental =
        ImmutableMap.of("requestCookies", Collections.singletonList(cookies));
    Map<String, Object> parameters =
        ImmutableMap.of(
            "url",
            "https://httpbin.org/cookies",
            "httpResponseBody",
            true,
            "experimental",
            experimental);
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    // Give the entity an explicit JSON content type so the body is encoded as UTF-8;
    // the single-argument StringEntity constructor falls back to ISO-8859-1 text.
    request.setEntity(new StringEntity(requestBody, ContentType.APPLICATION_JSON));

    try (CloseableHttpClient client = HttpClients.createDefault();
        CloseableHttpResponse response = client.execute(request)) {
      HttpEntity entity = response.getEntity();
      String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
      JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
      // httpResponseBody is Base64-encoded; decode it to text before printing.
      String base64HttpResponseBody = jsonObject.get("httpResponseBody").getAsString();
      byte[] httpResponseBodyBytes = Base64.getDecoder().decode(base64HttpResponseBody);
      String httpResponseBody = new String(httpResponseBodyBytes, StandardCharsets.UTF_8);
      System.out.println(httpResponseBody);
    }
  }

  /**
   * Builds the HTTP Basic Authorization header value, using the API key as the user name and
   * an empty password.
   *
   * @return the header value, starting with "Basic "
   */
  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    // Use an explicit charset instead of the platform default.
    String encodedAuth =
        Base64.getEncoder().encodeToString(auth.getBytes(StandardCharsets.UTF_8));
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')
// Send a cookie to httpbin.org through Zyte API and print the decoded
// response body.
const cookie = { name: 'foo', value: 'bar', domain: 'httpbin.org' }
const requestParameters = {
  url: 'https://httpbin.org/cookies',
  httpResponseBody: true,
  experimental: { requestCookies: [cookie] }
}
const requestConfig = { auth: { username: 'YOUR_API_KEY' } }

axios
  .post('https://api.zyte.com/v1/extract', requestParameters, requestConfig)
  .then(({ data: apiResponse }) => {
    // httpResponseBody is Base64-encoded; decode it to text before printing.
    const bodyBuffer = Buffer.from(apiResponse.httpResponseBody, 'base64')
    console.log(bodyBuffer.toString())
  })
<?php
// Send a cookie to httpbin.org through Zyte API and print the decoded
// response body.
$client = new GuzzleHttp\Client();
$cookie = [
    'name' => 'foo',
    'value' => 'bar',
    'domain' => 'httpbin.org',
];
$zyte_api_parameters = [
    'url' => 'https://httpbin.org/cookies',
    'httpResponseBody' => true,
    'experimental' => [
        'requestCookies' => [$cookie],
    ],
];
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', [
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => $zyte_api_parameters,
]);
$api = json_decode($response->getBody());
// httpResponseBody is Base64-encoded; decode it before printing.
echo base64_decode($api->httpResponseBody);
from base64 import b64decode
import requests
# Send a cookie to httpbin.org through Zyte API and print the decoded
# response body.
cookie = {
    "name": "foo",
    "value": "bar",
    "domain": "httpbin.org",
}
request_parameters = {
    "url": "https://httpbin.org/cookies",
    "httpResponseBody": True,
    "experimental": {"requestCookies": [cookie]},
}
api_response = requests.post(
    "https://api.zyte.com/v1/extract",
    auth=("YOUR_API_KEY", ""),
    json=request_parameters,
)
# httpResponseBody is Base64-encoded; decode it to text before printing.
encoded_body = api_response.json()["httpResponseBody"]
http_response_body = b64decode(encoded_body)
print(http_response_body.decode())
import asyncio
from base64 import b64decode
from zyte_api.aio.client import AsyncClient
async def main():
    """Send a cookie to httpbin.org through Zyte API and print the decoded body."""
    client = AsyncClient()
    cookie = {
        "name": "foo",
        "value": "bar",
        "domain": "httpbin.org",
    }
    query = {
        "url": "https://httpbin.org/cookies",
        "httpResponseBody": True,
        "experimental": {"requestCookies": [cookie]},
    }
    api_response = await client.request_raw(query)
    # httpResponseBody is Base64-encoded; decode it to text before printing.
    http_response_body = b64decode(api_response["httpResponseBody"])
    print(http_response_body.decode())


asyncio.run(main())
from scrapy import Request, Spider
class QuotesToScrapeComSpider(Spider):
    """Send a cookie to httpbin.org through Zyte API and print the response text."""

    name = "quotes_toscrape_com"

    def start_requests(self):
        """Yield the single request, passing requestCookies via zyte_api_automap."""
        cookie = {
            "name": "foo",
            "value": "bar",
            "domain": "httpbin.org",
        }
        zyte_api_parameters = {
            "experimental": {"requestCookies": [cookie]},
        }
        yield Request(
            "https://httpbin.org/cookies",
            meta={"zyte_api_automap": zyte_api_parameters},
        )

    def parse(self, response):
        """Print the response text."""
        print(response.text)
Output:
{
"cookies": {
"foo": "bar"
}
}
Redirection#
Zyte API always follows HTTP redirections.
On browser requests, redirections triggered by HTML or JavaScript are also followed.
Response headers#
Set the httpResponseHeaders
key in your API request body to true
to
extract response headers.
When you do, the Zyte API response includes an httpResponseHeaders
key with
the headers as an array of objects with name
and value
keys.
Example
Note
Install and configure code example requirements to run the example below.
using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
// Request the response headers of https://toscrape.com through Zyte API and
// collect them into a dictionary keyed by header name.

// Let the handler transparently decompress compressed API responses.
HttpClientHandler handler = new HttpClientHandler()
{
    AutomaticDecompression = DecompressionMethods.All
};
// Disposing the client also disposes the handler it was given.
using HttpClient client = new HttpClient(handler);

// HTTP Basic auth: the API key is the user name and the password is empty.
var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);
client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

var input = new Dictionary<string, object>(){
    {"url", "https://toscrape.com"},
    {"httpResponseHeaders", true}
};
var inputJson = JsonSerializer.Serialize(input);
using var content = new StringContent(inputJson, Encoding.UTF8, "application/json");

using HttpResponseMessage response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
// Fail here with the HTTP status rather than later with a missing-property error.
response.EnsureSuccessStatusCode();
var body = await response.Content.ReadAsByteArrayAsync();

// JsonDocument is IDisposable; dispose it so its buffers are released.
using var data = JsonDocument.Parse(body);

// httpResponseHeaders is an array of { name, value } objects. A header name can
// appear more than once (for example Set-Cookie) and header names are
// case-insensitive, so Dictionary.Add would throw on a repeated name. Merge
// repeated names into a single comma-separated entry instead.
var headers = new Dictionary<string, string>(System.StringComparer.OrdinalIgnoreCase);
foreach (var header in data.RootElement.GetProperty("httpResponseHeaders").EnumerateArray())
{
    var name = header.GetProperty("name").ToString();
    var value = header.GetProperty("value").ToString();
    headers[name] = headers.TryGetValue(name, out var existing)
        ? existing + ", " + value
        : value;
}
{"url": "https://toscrape.com", "httpResponseHeaders": true}
# POST input.json to Zyte API and print the response headers it returns.
curl \
    -u YOUR_API_KEY: \
    -H 'Content-Type: application/json' \
    -d @input.json \
    --compressed \
    https://api.zyte.com/v1/extract |
    jq .httpResponseHeaders
{"url": "https://toscrape.com", "httpResponseHeaders": true}
# Run the request from input.jsonl (stderr discarded) and print the response
# headers it returns.
zyte-api input.jsonl 2>/dev/null |
    jq .httpResponseHeaders
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;
/** Requests the response headers of https://toscrape.com through Zyte API. */
class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  /**
   * Runs the example request and reads the httpResponseHeaders array from the API response.
   *
   * @param args unused
   */
  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    // Only httpResponseHeaders is requested, matching the other language examples on this
    // page; the previous extra "browserHtml" parameter was not used by this example.
    Map<String, Object> parameters =
        ImmutableMap.of("url", "https://toscrape.com", "httpResponseHeaders", true);
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    // Give the entity an explicit JSON content type so the body is encoded as UTF-8;
    // the single-argument StringEntity constructor falls back to ISO-8859-1 text.
    request.setEntity(new StringEntity(requestBody, ContentType.APPLICATION_JSON));

    try (CloseableHttpClient client = HttpClients.createDefault();
        CloseableHttpResponse response = client.execute(request)) {
      HttpEntity entity = response.getEntity();
      String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
      JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
      JsonArray httpResponseHeaders = jsonObject.get("httpResponseHeaders").getAsJsonArray();
    }
  }

  /**
   * Builds the HTTP Basic Authorization header value, using the API key as the user name and
   * an empty password.
   *
   * @return the header value, starting with "Basic "
   */
  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    // Use an explicit charset instead of the platform default.
    String encodedAuth =
        Base64.getEncoder().encodeToString(auth.getBytes(StandardCharsets.UTF_8));
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')
// Request the response headers of https://toscrape.com through Zyte API.
const requestParameters = {
  url: 'https://toscrape.com',
  httpResponseHeaders: true
}
const requestConfig = { auth: { username: 'YOUR_API_KEY' } }

axios
  .post('https://api.zyte.com/v1/extract', requestParameters, requestConfig)
  .then(({ data: apiResponse }) => {
    const { httpResponseHeaders } = apiResponse
  })
<?php
// Request the response headers of https://toscrape.com through Zyte API.
$client = new GuzzleHttp\Client();
$zyte_api_parameters = [
    'url' => 'https://toscrape.com',
    'httpResponseHeaders' => true,
];
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', [
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => $zyte_api_parameters,
]);
$http_response_headers = json_decode($response->getBody())->httpResponseHeaders;
import requests
# Request the response headers of https://toscrape.com through Zyte API.
request_parameters = {
    'url': 'https://toscrape.com',
    'httpResponseHeaders': True,
}
api_response = requests.post(
    'https://api.zyte.com/v1/extract',
    auth=('YOUR_API_KEY', ''),
    json=request_parameters,
)
api_data = api_response.json()
http_response_headers = api_data['httpResponseHeaders']
import asyncio
from zyte_api.aio.client import AsyncClient
async def main():
    """Request the response headers of https://toscrape.com through Zyte API."""
    client = AsyncClient()
    query = {
        'url': 'https://toscrape.com',
        'httpResponseHeaders': True,
    }
    api_response = await client.request_raw(query)
    http_response_headers = api_response['httpResponseHeaders']


asyncio.run(main())
from scrapy import Request, Spider
class ToScrapeComSpider(Spider):
    """Request only the response headers of https://toscrape.com."""

    name = "toscrape_com"

    def start_requests(self):
        """Yield the single request, disabling the body and enabling headers."""
        zyte_api_parameters = {
            "httpResponseBody": False,
            "httpResponseHeaders": True,
        }
        yield Request(
            "https://toscrape.com",
            meta={"zyte_api_automap": zyte_api_parameters},
        )

    def parse(self, response):
        """Read the headers from the response."""
        headers = response.headers
Note
In transparent mode, httpResponseHeaders
is sent by
default for httpResponseBody requests,
but sending it explicitly is still recommended, as future
versions of scrapy-zyte-api may stop sending it by default.
Metadata#
Set the echoData
key in your API request body to an arbitrary value, to get
that value verbatim in the API response.
When sending multiple requests in parallel, this can be useful, for example, to keep track of the original request order.
Example
Note
Install and configure code example requirements to run the example below.
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
// Send several Zyte API requests in parallel, tagging each one with an echoData
// value, and collect the responses in the order in which they complete.
var inputData = new List<List<object>>()
{
    new List<object>(){"https://toscrape.com", 1},
    new List<object>(){"https://books.toscrape.com", 2},
    new List<object>(){"https://quotes.toscrape.com", 3},
};
var output = new List<HttpResponseMessage>();

var handler = new HttpClientHandler()
{
    AutomaticDecompression = DecompressionMethods.All,
    MaxConnectionsPerServer = 15
};
var client = new HttpClient(handler);

// HTTP Basic auth: the API key is the user name and the password is empty.
var apiKey = "YOUR_API_KEY";
var credentials = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
client.DefaultRequestHeaders.Add("Authorization", "Basic " + System.Convert.ToBase64String(credentials));
client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

// Start every request up front; ToList() forces the requests to be sent now.
var pending = inputData
    .Select(entry =>
    {
        var parameters = new Dictionary<string, object>(){
            {"url", entry[0]},
            {"browserHtml", true},
            {"echoData", entry[1]}
        };
        var json = JsonSerializer.Serialize(parameters);
        var payload = new StringContent(json, Encoding.UTF8, "application/json");
        return client.PostAsync("https://api.zyte.com/v1/extract", payload);
    })
    .ToList();

// Drain the list, taking whichever request finishes next.
while (pending.Count > 0)
{
    var finished = await Task.WhenAny(pending);
    pending.Remove(finished);
    output.Add(await finished);
}
{"url": "https://toscrape.com", "browserHtml": true, "echoData": 1}
{"url": "https://books.toscrape.com", "browserHtml": true, "echoData": 2}
{"url": "https://quotes.toscrape.com", "browserHtml": true, "echoData": 3}
# Send each line of input.jsonl to Zyte API as its own request body, running up
# to 15 curl processes at a time (xargs -P 15), and append the echoData value of
# each response to output.jsonl.
# $ZYTE_API_KEY is not backslash-escaped inside the double-quoted script, so the
# invoking shell expands it; the escaped \$0 (the input line that xargs passes to
# bash) and \$1 (the awk field) are left for the inner bash and awk.
cat input.jsonl \
| xargs -P 15 -d\\n -n 1 \
bash -c "
curl \
--user $ZYTE_API_KEY: \
--header 'Content-Type: application/json' \
--data \"\$0\" \
--compressed \
https://api.zyte.com/v1/extract \
| jq .echoData \
| awk '{print \$1}' \
>> output.jsonl
"
{"url": "https://toscrape.com", "browserHtml": true, "echoData": 1}
{"url": "https://books.toscrape.com", "browserHtml": true, "echoData": 2}
{"url": "https://quotes.toscrape.com", "browserHtml": true, "echoData": 3}
# Send the requests listed in input.jsonl with --n-conn set to 15 and -o set to
# output.jsonl (presumably the connection limit and the output file; confirm
# against the zyte-api CLI help).
zyte-api --n-conn 15 input.jsonl -o output.jsonl
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Base64;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import org.apache.hc.client5.http.async.methods.SimpleHttpRequest;
import org.apache.hc.client5.http.async.methods.SimpleHttpResponse;
import org.apache.hc.client5.http.impl.async.CloseableHttpAsyncClient;
import org.apache.hc.client5.http.impl.async.HttpAsyncClients;
import org.apache.hc.client5.http.impl.nio.PoolingAsyncClientConnectionManager;
import org.apache.hc.client5.http.impl.nio.PoolingAsyncClientConnectionManagerBuilder;
import org.apache.hc.client5.http.ssl.ClientTlsStrategyBuilder;
import org.apache.hc.core5.concurrent.FutureCallback;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.nio.ssl.TlsStrategy;
import org.apache.hc.core5.reactor.ssl.TlsDetails;
/**
 * Sends several Zyte API requests in parallel, tagging each one with an echoData value, and
 * collects the raw API responses.
 */
class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  /**
   * Sends all requests, then waits for every one of them to finish.
   *
   * @param args unused
   */
  public static void main(final String[] args)
      throws ExecutionException, InterruptedException, IOException, ParseException {
    Object[][] input = {
      {"https://toscrape.com", 1},
      {"https://books.toscrape.com", 2},
      {"https://quotes.toscrape.com", 3}
    };
    List<Future<SimpleHttpResponse>> futures = new ArrayList<>();
    // The callbacks below are invoked by the async client, not by this thread, so the
    // shared list is synchronized to stay consistent if several complete at once.
    List<String> output = java.util.Collections.synchronizedList(new ArrayList<String>());
    int concurrency = 15;

    // https://issues.apache.org/jira/browse/HTTPCLIENT-2219
    final TlsStrategy tlsStrategy =
        ClientTlsStrategyBuilder.create()
            .useSystemProperties()
            .setTlsDetailsFactory(
                sslEngine ->
                    new TlsDetails(sslEngine.getSession(), sslEngine.getApplicationProtocol()))
            .build();

    PoolingAsyncClientConnectionManager connectionManager =
        PoolingAsyncClientConnectionManagerBuilder.create().setTlsStrategy(tlsStrategy).build();
    connectionManager.setMaxTotal(concurrency);
    connectionManager.setDefaultMaxPerRoute(concurrency);

    CloseableHttpAsyncClient client =
        HttpAsyncClients.custom().setConnectionManager(connectionManager).build();
    try {
      client.start();
      for (int i = 0; i < input.length; i++) {
        Map<String, Object> parameters =
            ImmutableMap.of("url", input[i][0], "browserHtml", true, "echoData", input[i][1]);
        String requestBody = new Gson().toJson(parameters);

        SimpleHttpRequest request =
            new SimpleHttpRequest("POST", "https://api.zyte.com/v1/extract");
        request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
        request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
        request.setBody(requestBody, ContentType.APPLICATION_JSON);

        final Future<SimpleHttpResponse> future =
            client.execute(
                request,
                new FutureCallback<SimpleHttpResponse>() {
                  public void completed(final SimpleHttpResponse response) {
                    String apiResponse = response.getBodyText();
                    output.add(apiResponse);
                  }

                  // Nothing to do here: a failure is still reported by Future.get() below.
                  public void failed(final Exception ex) {}

                  public void cancelled() {}
                });
        futures.add(future);
      }
      // Block until every request has finished; get() throws if a request failed.
      for (Future<SimpleHttpResponse> future : futures) {
        future.get();
      }
    } finally {
      client.close();
    }
  }

  /**
   * Builds the HTTP Basic Authorization header value, using the API key as the user name and
   * an empty password.
   *
   * @return the header value, starting with "Basic "
   */
  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    // Use an explicit charset instead of the platform default.
    String encodedAuth =
        Base64.getEncoder()
            .encodeToString(auth.getBytes(java.nio.charset.StandardCharsets.UTF_8));
    return "Basic " + encodedAuth;
  }
}
const { ConcurrencyManager } = require('axios-concurrency')
const axios = require('axios')
// Send several Zyte API requests with at most 15 running at once, tagging each
// one with an echoData value, and collect the API responses.
const urls = [
  ['https://toscrape.com', 1],
  ['https://books.toscrape.com', 2],
  ['https://quotes.toscrape.com', 3]
]
const output = []

const client = axios.create()
ConcurrencyManager(client, 15)

const requestConfig = { auth: { username: 'YOUR_API_KEY' } }

// Send one request and store its API response once it arrives.
const extract = ([url, echoData]) =>
  client
    .post(
      'https://api.zyte.com/v1/extract',
      { url, browserHtml: true, echoData },
      requestConfig
    )
    .then((response) => output.push(response.data))

Promise.all(urls.map(extract))
<?php
// Send several Zyte API requests asynchronously, tagging each one with an
// echoData value, and collect the decoded API responses in $output.
$input = [
    ['https://toscrape.com', 1],
    ['https://books.toscrape.com', 2],
    ['https://quotes.toscrape.com', 3],
];
$output = [];
$promises = [];
$client = new GuzzleHttp\Client();
foreach ($input as $url_and_index) {
    $options = [
        'auth' => ['YOUR_API_KEY', ''],
        'headers' => ['Accept-Encoding' => 'gzip'],
        'json' => [
            'url' => $url_and_index[0],
            'browserHtml' => true,
            'echoData' => $url_and_index[1],
        ],
    ];
    $request = new \GuzzleHttp\Psr7\Request('POST', 'https://api.zyte.com/v1/extract');
    // Capture $output by reference instead of declaring it global, so the
    // callback writes to this $output even when the script runs inside a
    // function scope (for example when included from a method).
    $promises[] = $client->sendAsync($request, $options)->then(
        function ($response) use (&$output) {
            $output[] = json_decode($response->getBody());
        }
    );
}
// Block until every request has completed.
foreach ($promises as $promise) {
    $promise->wait();
}
import asyncio
import aiohttp
input_data = [
    ('https://toscrape.com', 1),
    ('https://books.toscrape.com', 2),
    ('https://quotes.toscrape.com', 3),
]
output = []


async def extract(client, url, index):
    """Send one request tagged with ``index`` as echoData and store the API response."""
    payload = {'url': url, 'browserHtml': True, 'echoData': index}
    response = await client.post(
        'https://api.zyte.com/v1/extract',
        json=payload,
        auth=aiohttp.BasicAuth('YOUR_API_KEY'),
    )
    api_response = await response.json()
    output.append(api_response)


async def main():
    """Run all requests concurrently with at most 15 connections per host."""
    connector = aiohttp.TCPConnector(limit_per_host=15)
    async with aiohttp.ClientSession(connector=connector) as client:
        pending = [extract(client, url, index) for url, index in input_data]
        await asyncio.gather(*pending)


asyncio.run(main())
import asyncio
from zyte_api.aio.client import AsyncClient, create_session
input_data = [
    ('https://toscrape.com', 1),
    ('https://books.toscrape.com', 2),
    ('https://quotes.toscrape.com', 3),
]
output = []


async def main():
    """Send all requests in parallel and store the responses as they complete."""
    connection_count = 15
    client = AsyncClient(n_conn=connection_count)

    # One request per input entry, tagged with its index as echoData.
    requests = []
    for url, index in input_data:
        requests.append({'url': url, 'browserHtml': True, 'echoData': index})

    async with create_session(connection_count) as session:
        responses = client.request_parallel_as_completed(
            requests,
            session=session,
        )
        for response in responses:
            api_response = await response
            output.append(api_response)


asyncio.run(main())
from scrapy import Request, Spider
input_data = [
    ("https://toscrape.com", 1),
    ("https://books.toscrape.com", 2),
    ("https://quotes.toscrape.com", 3),
]


class ToScrapeSpider(Spider):
    """Request each input URL with browserHtml, tagging it with its index as echoData."""

    name = "toscrape_com"

    custom_settings = {
        "CONCURRENT_REQUESTS": 15,
        "CONCURRENT_REQUESTS_PER_DOMAIN": 15,
    }

    def start_requests(self):
        """Yield one request per input entry."""
        for url, index in input_data:
            zyte_api_parameters = {
                "browserHtml": True,
                "echoData": index,
            }
            yield Request(url, meta={"zyte_api_automap": zyte_api_parameters})

    def parse(self, response):
        """Yield the echoed index together with the response HTML."""
        index = response.raw_api_response["echoData"]
        html = response.text
        yield {"index": index, "html": html}
Alternatively, you can use Scrapy’s Request.cb_kwargs
directly for a
similar purpose:
def start_requests(self):
    """Yield one request per input entry, passing its index to the callback."""
    for url, index in input_data:
        zyte_api_parameters = {"browserHtml": True}
        yield Request(
            url,
            cb_kwargs={"index": index},
            meta={"zyte_api_automap": zyte_api_parameters},
        )


def parse(self, response, index):
    """Yield the index received through cb_kwargs together with the response HTML."""
    html = response.text
    yield {"index": index, "html": html}
There is another metadata field that you can set and get verbatim in the API
response: jobId. When running your requests from a Zyte Scrapy Cloud job,
this field is meant to indicate the corresponding job ID. scrapy-zyte-api
fills this field automatically.