Zyte API HTTP requests#
To send HTTP requests through Zyte API, without browser rendering, set the httpResponseBody request field to true, and read the Base64-encoded response body from the httpResponseBody response field.
Example
Note
Install and configure code example requirements and the Zyte CA certificate to run the example below.
using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
HttpClientHandler handler = new HttpClientHandler()
{
AutomaticDecompression = DecompressionMethods.All
};
HttpClient client = new HttpClient(handler);
var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);
client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");
var input = new Dictionary<string, object>(){
{"url", "https://toscrape.com"},
{"httpResponseBody", true}
};
var inputJson = JsonSerializer.Serialize(input);
var content = new StringContent(inputJson, Encoding.UTF8, "application/json");
HttpResponseMessage response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
var body = await response.Content.ReadAsByteArrayAsync();
var data = JsonDocument.Parse(body);
var base64HttpResponseBody = data.RootElement.GetProperty("httpResponseBody").ToString();
var httpResponseBody = System.Convert.FromBase64String(base64HttpResponseBody);
{"url": "https://toscrape.com", "httpResponseBody": true}
zyte-api input.jsonl \
| jq --raw-output .httpResponseBody \
| base64 --decode \
> output.html
{
"url": "https://toscrape.com",
"httpResponseBody": true
}
curl \
--user YOUR_API_KEY: \
--header 'Content-Type: application/json' \
--data @input.json \
--compressed \
https://api.zyte.com/v1/extract \
| jq --raw-output .httpResponseBody \
| base64 --decode \
> output.html
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;
/** Sends an HTTP request through Zyte API and prints the decoded response body. */
class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  /**
   * Posts an extraction request for https://toscrape.com with {@code httpResponseBody} enabled,
   * Base64-decodes the {@code httpResponseBody} field of the API response and prints it as UTF-8
   * text.
   *
   * @param args command-line arguments (unused)
   * @throws IOException if the request fails or the client cannot be closed
   */
  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> parameters =
        ImmutableMap.of("url", "https://toscrape.com", "httpResponseBody", true);
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    // Pass the content type explicitly so the JSON payload is encoded as UTF-8 instead of the
    // text/plain default of the single-argument constructor.
    request.setEntity(new StringEntity(requestBody, ContentType.APPLICATION_JSON));

    // try-with-resources releases the client's connections once the request has completed.
    try (CloseableHttpClient client = HttpClients.createDefault()) {
      client.execute(
          request,
          response -> {
            HttpEntity entity = response.getEntity();
            String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
            JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
            String base64HttpResponseBody = jsonObject.get("httpResponseBody").getAsString();
            byte[] httpResponseBodyBytes = Base64.getDecoder().decode(base64HttpResponseBody);
            String httpResponseBody = new String(httpResponseBodyBytes, StandardCharsets.UTF_8);
            System.out.println(httpResponseBody);
            return null;
          });
    }
  }

  /**
   * Builds the HTTP Basic {@code Authorization} header value, using the API key as the user name
   * and an empty password.
   *
   * @return the header value, {@code "Basic "} followed by the Base64-encoded credentials
   */
  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    // Use an explicit charset so the encoding does not depend on the platform default.
    String encodedAuth = Base64.getEncoder().encodeToString(auth.getBytes(StandardCharsets.UTF_8));
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')
axios.post(
'https://api.zyte.com/v1/extract',
{
url: 'https://toscrape.com',
httpResponseBody: true
},
{
auth: { username: 'YOUR_API_KEY' }
}
).then((response) => {
const httpResponseBody = Buffer.from(
response.data.httpResponseBody,
'base64'
)
})
<?php
$client = new GuzzleHttp\Client();
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', [
'auth' => ['YOUR_API_KEY', ''],
'headers' => ['Accept-Encoding' => 'gzip'],
'json' => [
'url' => 'https://toscrape.com',
'httpResponseBody' => true,
],
]);
$data = json_decode($response->getBody());
$http_response_body = base64_decode($data->httpResponseBody);
With the proxy mode, you always get a response body.
curl \
--proxy api.zyte.com:8011 \
--proxy-user YOUR_API_KEY: \
--compressed \
https://toscrape.com \
> output.html
from base64 import b64decode
import requests
api_response = requests.post(
"https://api.zyte.com/v1/extract",
auth=("YOUR_API_KEY", ""),
json={
"url": "https://toscrape.com",
"httpResponseBody": True,
},
)
http_response_body: bytes = b64decode(api_response.json()["httpResponseBody"])
import asyncio
from base64 import b64decode
from zyte_api import AsyncZyteAPI
async def main():
    """Fetch https://toscrape.com through Zyte API and print the decoded response body."""
    query = {"url": "https://toscrape.com", "httpResponseBody": True}
    api_response = await AsyncZyteAPI().get(query)
    # The httpResponseBody field is Base64-encoded; decode it to bytes, then to text.
    raw_body = b64decode(api_response["httpResponseBody"])
    print(raw_body.decode())
asyncio.run(main())
In transparent mode, when you target a text resource (e.g. HTML, JSON), regular Scrapy requests work out of the box:
from scrapy import Spider
class ToScrapeSpider(Spider):
    """Spider that requests https://toscrape.com and reads the response as text."""

    name = "toscrape_com"
    start_urls = ["https://toscrape.com"]

    def parse(self, response):
        """Read the decoded text of the response."""
        page_text: str = response.text
While regular Scrapy requests also work for binary responses at the moment, they may stop working in future versions of scrapy-zyte-api, so passing httpResponseBody is recommended when targeting binary resources:
from scrapy import Request, Spider
class ToScrapeSpider(Spider):
    """Spider that explicitly requests httpResponseBody for https://toscrape.com."""

    name = "toscrape_com"

    def start_requests(self):
        """Yield a single request whose meta sets httpResponseBody under zyte_api_automap."""
        zyte_api_meta = {"zyte_api_automap": {"httpResponseBody": True}}
        yield Request("https://toscrape.com", meta=zyte_api_meta)

    def parse(self, response):
        """Read the raw bytes of the response."""
        raw_body: bytes = response.body
Output (first 5 lines):
<!DOCTYPE html>
<html lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>Scraping Sandbox</title>
For HTTP requests, Zyte API also supports:
Geolocation, IP type, cookies, sessions, response headers, and metadata.
Tip
HTTP responses do not reflect HTML content rendered by a web browser that executes JavaScript code. To get browser HTML, use a browser request. See also HTML and browser HTML.
Request method#
HTTP requests use the GET HTTP method by default. Use the httpRequestMethod field to set a different HTTP method.
Tip
When using POST, PUT or similar, you probably want to also set a request body.
Example
Note
Install and configure code example requirements and the Zyte CA certificate to run the example below.
using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
HttpClientHandler handler = new HttpClientHandler()
{
AutomaticDecompression = DecompressionMethods.All
};
HttpClient client = new HttpClient(handler);
var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);
client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");
var input = new Dictionary<string, object>(){
{"url", "https://httpbin.org/anything"},
{"httpResponseBody", true},
{"httpRequestMethod", "POST"}
};
var inputJson = JsonSerializer.Serialize(input);
var content = new StringContent(inputJson, Encoding.UTF8, "application/json");
HttpResponseMessage response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
var body = await response.Content.ReadAsByteArrayAsync();
var data = JsonDocument.Parse(body);
var base64HttpResponseBody = data.RootElement.GetProperty("httpResponseBody").ToString();
var httpResponseBody = System.Convert.FromBase64String(base64HttpResponseBody);
var responseData = JsonDocument.Parse(httpResponseBody);
var method = responseData.RootElement.GetProperty("method").ToString();
{"url": "https://httpbin.org/anything", "httpResponseBody": true, "httpRequestMethod": "POST"}
zyte-api input.jsonl \
| jq --raw-output .httpResponseBody \
| base64 --decode \
| jq .method
{
"url": "https://httpbin.org/anything",
"httpResponseBody": true,
"httpRequestMethod": "POST"
}
curl \
--user YOUR_API_KEY: \
--header 'Content-Type: application/json' \
--data @input.json \
--compressed \
https://api.zyte.com/v1/extract \
| jq --raw-output .httpResponseBody \
| base64 --decode \
| jq .method
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;
/** Sends a POST request through Zyte API and prints the method reported by the target site. */
class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  /**
   * Posts an extraction request for https://httpbin.org/anything with {@code httpRequestMethod}
   * set to {@code POST}, Base64-decodes the {@code httpResponseBody} field of the API response,
   * parses it as JSON and prints its {@code method} value.
   *
   * @param args command-line arguments (unused)
   * @throws IOException if the request fails or the client cannot be closed
   */
  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> parameters =
        ImmutableMap.of(
            "url",
            "https://httpbin.org/anything",
            "httpResponseBody",
            true,
            "httpRequestMethod",
            "POST");
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    // Pass the content type explicitly so the JSON payload is encoded as UTF-8 instead of the
    // text/plain default of the single-argument constructor.
    request.setEntity(new StringEntity(requestBody, ContentType.APPLICATION_JSON));

    // try-with-resources releases the client's connections once the request has completed.
    try (CloseableHttpClient client = HttpClients.createDefault()) {
      client.execute(
          request,
          response -> {
            HttpEntity entity = response.getEntity();
            String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
            JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
            String base64HttpResponseBody = jsonObject.get("httpResponseBody").getAsString();
            byte[] httpResponseBodyBytes = Base64.getDecoder().decode(base64HttpResponseBody);
            String httpResponseBody = new String(httpResponseBodyBytes, StandardCharsets.UTF_8);
            JsonObject data = JsonParser.parseString(httpResponseBody).getAsJsonObject();
            String method = data.get("method").getAsString();
            System.out.println(method);
            return null;
          });
    }
  }

  /**
   * Builds the HTTP Basic {@code Authorization} header value, using the API key as the user name
   * and an empty password.
   *
   * @return the header value, {@code "Basic "} followed by the Base64-encoded credentials
   */
  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    // Use an explicit charset so the encoding does not depend on the platform default.
    String encodedAuth = Base64.getEncoder().encodeToString(auth.getBytes(StandardCharsets.UTF_8));
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')
axios.post(
'https://api.zyte.com/v1/extract',
{
url: 'https://httpbin.org/anything',
httpResponseBody: true,
httpRequestMethod: 'POST'
},
{
auth: { username: 'YOUR_API_KEY' }
}
).then((response) => {
const httpResponseBody = Buffer.from(
response.data.httpResponseBody,
'base64'
)
const method = JSON.parse(httpResponseBody).method
})
<?php
$client = new GuzzleHttp\Client();
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', [
'auth' => ['YOUR_API_KEY', ''],
'headers' => ['Accept-Encoding' => 'gzip'],
'json' => [
'url' => 'https://httpbin.org/anything',
'httpResponseBody' => true,
'httpRequestMethod' => 'POST',
],
]);
$data = json_decode($response->getBody());
$http_response_body = base64_decode($data->httpResponseBody);
$method = json_decode($http_response_body)->method;
With the proxy mode, the request method from your requests is used automatically.
curl \
--proxy api.zyte.com:8011 \
--proxy-user YOUR_API_KEY: \
--compressed \
-X POST \
https://httpbin.org/anything \
| jq .method
import json
from base64 import b64decode
import requests
api_response = requests.post(
"https://api.zyte.com/v1/extract",
auth=("YOUR_API_KEY", ""),
json={
"url": "https://httpbin.org/anything",
"httpResponseBody": True,
"httpRequestMethod": "POST",
},
)
http_response_body = b64decode(api_response.json()["httpResponseBody"])
method = json.loads(http_response_body)["method"]
import asyncio
import json
from base64 import b64decode
from zyte_api import AsyncZyteAPI
async def main():
    """Send a POST request to https://httpbin.org/anything through Zyte API.

    Prints the "method" value of the JSON document found in the decoded
    response body.
    """
    query = {
        "url": "https://httpbin.org/anything",
        "httpResponseBody": True,
        "httpRequestMethod": "POST",
    }
    api_response = await AsyncZyteAPI().get(query)
    # The httpResponseBody field is Base64-encoded; the decoded bytes are parsed as JSON.
    decoded: bytes = b64decode(api_response["httpResponseBody"])
    document = json.loads(decoded)
    print(document["method"])
asyncio.run(main())
import json
from scrapy import Request, Spider
class HTTPBinOrgSpider(Spider):
    """Spider that sends a POST request to https://httpbin.org/anything."""

    name = "httpbin_org"

    def start_requests(self):
        """Yield a single POST request to the target URL."""
        post_request = Request("https://httpbin.org/anything", method="POST")
        yield post_request

    def parse(self, response):
        """Parse the response text as JSON and read its "method" value."""
        document = json.loads(response.text)
        method = document["method"]
Output:
"POST"
Request body#
To include a body in your request, use one of the following fields:
httpRequestText, for UTF-8-encoded text.
httpRequestBody, for anything else. It supports binary data as well, so the value must be Base64-encoded.
httpRequestText example
Note
Install and configure code example requirements and the Zyte CA certificate to run the example below.
using System;
using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
HttpClientHandler handler = new HttpClientHandler()
{
AutomaticDecompression = DecompressionMethods.All
};
HttpClient client = new HttpClient(handler);
var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);
client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");
var input = new Dictionary<string, object>(){
{"url", "https://httpbin.org/anything"},
{"httpResponseBody", true},
{"httpRequestMethod", "POST"},
{"httpRequestText", "{\"foo\": \"bar\"}"}
};
var inputJson = JsonSerializer.Serialize(input);
var content = new StringContent(inputJson, Encoding.UTF8, "application/json");
HttpResponseMessage response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
var body = await response.Content.ReadAsByteArrayAsync();
var data = JsonDocument.Parse(body);
var base64HttpResponseBody = data.RootElement.GetProperty("httpResponseBody").ToString();
var httpResponseBody = System.Convert.FromBase64String(base64HttpResponseBody);
var responseData = JsonDocument.Parse(httpResponseBody);
var requestBody = responseData.RootElement.GetProperty("data").ToString();
Console.WriteLine(requestBody);
{"url": "https://httpbin.org/anything", "httpResponseBody": true, "httpRequestMethod": "POST", "httpRequestText": "{\"foo\": \"bar\"}"}
zyte-api input.jsonl \
| jq --raw-output .httpResponseBody \
| base64 --decode \
| jq --raw-output .data
{
"url": "https://httpbin.org/anything",
"httpResponseBody": true,
"httpRequestMethod": "POST",
"httpRequestText": "{\"foo\": \"bar\"}"
}
curl \
--user YOUR_API_KEY: \
--header 'Content-Type: application/json' \
--data @input.json \
--compressed \
https://api.zyte.com/v1/extract \
| jq --raw-output .httpResponseBody \
| base64 --decode \
| jq --raw-output .data
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;
/** Sends a POST request with a text body through Zyte API and prints the body that was sent. */
class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  /**
   * Posts an extraction request for https://httpbin.org/anything with {@code httpRequestText}
   * set, Base64-decodes the {@code httpResponseBody} field of the API response, parses it as JSON
   * and prints its {@code data} value.
   *
   * @param args command-line arguments (unused)
   * @throws IOException if the request fails or the client cannot be closed
   */
  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> parameters =
        ImmutableMap.of(
            "url",
            "https://httpbin.org/anything",
            "httpResponseBody",
            true,
            "httpRequestMethod",
            "POST",
            "httpRequestText",
            "{\"foo\": \"bar\"}");
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    // Pass the content type explicitly so the JSON payload (including any non-ASCII request
    // text) is encoded as UTF-8 instead of the text/plain default of the single-argument
    // constructor.
    request.setEntity(new StringEntity(requestBody, ContentType.APPLICATION_JSON));

    // try-with-resources releases the client's connections once the request has completed.
    try (CloseableHttpClient client = HttpClients.createDefault()) {
      client.execute(
          request,
          response -> {
            HttpEntity entity = response.getEntity();
            String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
            JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
            String base64HttpResponseBody = jsonObject.get("httpResponseBody").getAsString();
            byte[] httpResponseBodyBytes = Base64.getDecoder().decode(base64HttpResponseBody);
            String httpResponseBody = new String(httpResponseBodyBytes, StandardCharsets.UTF_8);
            JsonObject data = JsonParser.parseString(httpResponseBody).getAsJsonObject();
            String body = data.get("data").getAsString();
            System.out.println(body);
            return null;
          });
    }
  }

  /**
   * Builds the HTTP Basic {@code Authorization} header value, using the API key as the user name
   * and an empty password.
   *
   * @return the header value, {@code "Basic "} followed by the Base64-encoded credentials
   */
  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    // Use an explicit charset so the encoding does not depend on the platform default.
    String encodedAuth = Base64.getEncoder().encodeToString(auth.getBytes(StandardCharsets.UTF_8));
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')
axios.post(
'https://api.zyte.com/v1/extract',
{
url: 'https://httpbin.org/anything',
httpResponseBody: true,
httpRequestMethod: 'POST',
httpRequestText: '{"foo": "bar"}'
},
{
auth: { username: 'YOUR_API_KEY' }
}
).then((response) => {
const httpResponseBody = Buffer.from(
response.data.httpResponseBody,
'base64'
)
const body = JSON.parse(httpResponseBody).data
console.log(body)
})
<?php
$client = new GuzzleHttp\Client();
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', [
'auth' => ['YOUR_API_KEY', ''],
'headers' => ['Accept-Encoding' => 'gzip'],
'json' => [
'url' => 'https://httpbin.org/anything',
'httpResponseBody' => true,
'httpRequestMethod' => 'POST',
'httpRequestText' => '{"foo": "bar"}',
],
]);
$data = json_decode($response->getBody());
$http_response_body = base64_decode($data->httpResponseBody);
$body = json_decode($http_response_body)->data;
echo $body.PHP_EOL;
With the proxy mode, the request body from your requests is used automatically, be it plain text or binary.
# POST a JSON body through the Zyte API proxy endpoint and print the "data" field of the JSON
# response. --raw-output prints the string without JSON quoting and escaping.
curl \
    --proxy api.zyte.com:8011 \
    --proxy-user YOUR_API_KEY: \
    --compressed \
    -X POST \
    -H "Content-Type: application/json" \
    --data '{"foo": "bar"}' \
    https://httpbin.org/anything \
| jq --raw-output .data
import json
from base64 import b64decode
import requests
api_response = requests.post(
"https://api.zyte.com/v1/extract",
auth=("YOUR_API_KEY", ""),
json={
"url": "https://httpbin.org/anything",
"httpResponseBody": True,
"httpRequestMethod": "POST",
"httpRequestText": '{"foo": "bar"}',
},
)
http_response_body = b64decode(api_response.json()["httpResponseBody"])
body: str = json.loads(http_response_body)["data"]
print(body)
import asyncio
import json
from base64 import b64decode
from zyte_api import AsyncZyteAPI
async def main():
    """Send a POST request with a text body to https://httpbin.org/anything through Zyte API.

    Prints the "data" value of the JSON document found in the decoded
    response body.
    """
    query = {
        "url": "https://httpbin.org/anything",
        "httpResponseBody": True,
        "httpRequestMethod": "POST",
        "httpRequestText": '{"foo": "bar"}',
    }
    api_response = await AsyncZyteAPI().get(query)
    # The httpResponseBody field is Base64-encoded; the decoded bytes are parsed as JSON.
    decoded = b64decode(api_response["httpResponseBody"])
    document = json.loads(decoded)
    print(document["data"])
asyncio.run(main())
import json
from scrapy import Request, Spider
class HTTPBinOrgSpider(Spider):
    """Spider that sends a POST request with a text body to https://httpbin.org/anything."""

    name = "httpbin_org"

    def start_requests(self):
        """Yield a single POST request carrying a JSON string as its body."""
        post_request = Request(
            "https://httpbin.org/anything",
            method="POST",
            body='{"foo": "bar"}',
        )
        yield post_request

    def parse(self, response):
        """Parse the response body as JSON and print its "data" value."""
        document = json.loads(response.body)
        print(document["data"])
Output:
{"foo": "bar"}
httpRequestBody example
Note
Install and configure code example requirements and the Zyte CA certificate to run the example below.
using System;
using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
HttpClientHandler handler = new HttpClientHandler()
{
AutomaticDecompression = DecompressionMethods.All
};
HttpClient client = new HttpClient(handler);
var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);
client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");
var input = new Dictionary<string, object>(){
{"url", "https://httpbin.org/anything"},
{"httpResponseBody", true},
{"httpRequestMethod", "POST"},
{"httpRequestBody", "Zm9v"}
};
var inputJson = JsonSerializer.Serialize(input);
var content = new StringContent(inputJson, Encoding.UTF8, "application/json");
HttpResponseMessage response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
var body = await response.Content.ReadAsByteArrayAsync();
var data = JsonDocument.Parse(body);
var base64HttpResponseBody = data.RootElement.GetProperty("httpResponseBody").ToString();
var httpResponseBody = System.Convert.FromBase64String(base64HttpResponseBody);
var responseData = JsonDocument.Parse(httpResponseBody);
var requestBody = responseData.RootElement.GetProperty("data").ToString();
Console.WriteLine(requestBody);
{"url": "https://httpbin.org/anything", "httpResponseBody": true, "httpRequestMethod": "POST", "httpRequestBody": "Zm9v"}
zyte-api input.jsonl \
| jq --raw-output .httpResponseBody \
| base64 --decode \
| jq --raw-output .data
{
"url": "https://httpbin.org/anything",
"httpResponseBody": true,
"httpRequestMethod": "POST",
"httpRequestBody": "Zm9v"
}
curl \
--user YOUR_API_KEY: \
--header 'Content-Type: application/json' \
--data @input.json \
--compressed \
https://api.zyte.com/v1/extract \
| jq --raw-output .httpResponseBody \
| base64 --decode \
| jq --raw-output .data
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;
/**
 * Sends a POST request with a Base64-encoded body through Zyte API and prints the body that was
 * sent.
 */
class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  /**
   * Posts an extraction request for https://httpbin.org/anything with {@code httpRequestBody} set
   * to a Base64-encoded value, Base64-decodes the {@code httpResponseBody} field of the API
   * response, parses it as JSON and prints its {@code data} value.
   *
   * @param args command-line arguments (unused)
   * @throws IOException if the request fails or the client cannot be closed
   */
  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> parameters =
        ImmutableMap.of(
            "url",
            "https://httpbin.org/anything",
            "httpResponseBody",
            true,
            "httpRequestMethod",
            "POST",
            "httpRequestBody",
            "Zm9v");
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    // Pass the content type explicitly so the JSON payload is encoded as UTF-8 instead of the
    // text/plain default of the single-argument constructor.
    request.setEntity(new StringEntity(requestBody, ContentType.APPLICATION_JSON));

    // try-with-resources releases the client's connections once the request has completed.
    try (CloseableHttpClient client = HttpClients.createDefault()) {
      client.execute(
          request,
          response -> {
            HttpEntity entity = response.getEntity();
            String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
            JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
            String base64HttpResponseBody = jsonObject.get("httpResponseBody").getAsString();
            byte[] httpResponseBodyBytes = Base64.getDecoder().decode(base64HttpResponseBody);
            String httpResponseBody = new String(httpResponseBodyBytes, StandardCharsets.UTF_8);
            JsonObject data = JsonParser.parseString(httpResponseBody).getAsJsonObject();
            String body = data.get("data").getAsString();
            System.out.println(body);
            return null;
          });
    }
  }

  /**
   * Builds the HTTP Basic {@code Authorization} header value, using the API key as the user name
   * and an empty password.
   *
   * @return the header value, {@code "Basic "} followed by the Base64-encoded credentials
   */
  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    // Use an explicit charset so the encoding does not depend on the platform default.
    String encodedAuth = Base64.getEncoder().encodeToString(auth.getBytes(StandardCharsets.UTF_8));
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')
axios.post(
'https://api.zyte.com/v1/extract',
{
url: 'https://httpbin.org/anything',
httpResponseBody: true,
httpRequestMethod: 'POST',
httpRequestBody: 'Zm9v'
},
{
auth: { username: 'YOUR_API_KEY' }
}
).then((response) => {
const httpResponseBody = Buffer.from(
response.data.httpResponseBody,
'base64'
)
const body = JSON.parse(httpResponseBody).data
console.log(body)
})
<?php
$client = new GuzzleHttp\Client();
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', [
'auth' => ['YOUR_API_KEY', ''],
'headers' => ['Accept-Encoding' => 'gzip'],
'json' => [
'url' => 'https://httpbin.org/anything',
'httpResponseBody' => true,
'httpRequestMethod' => 'POST',
'httpRequestBody' => 'Zm9v',
],
]);
$data = json_decode($response->getBody());
$http_response_body = base64_decode($data->httpResponseBody);
$body = json_decode($http_response_body)->data;
echo $body.PHP_EOL;
With the proxy mode, the request body from your requests is used automatically, be it plain text or binary.
# POST a raw body through the Zyte API proxy endpoint and print the "data" field of the JSON
# response. --raw-output prints the string without JSON quoting.
curl \
    --proxy api.zyte.com:8011 \
    --proxy-user YOUR_API_KEY: \
    --compressed \
    -X POST \
    -H "Content-Type: application/octet-stream" \
    --data foo \
    https://httpbin.org/anything \
| jq --raw-output .data
import json
from base64 import b64decode
import requests
api_response = requests.post(
"https://api.zyte.com/v1/extract",
auth=("YOUR_API_KEY", ""),
json={
"url": "https://httpbin.org/anything",
"httpResponseBody": True,
"httpRequestMethod": "POST",
"httpRequestBody": "Zm9v",
},
)
http_response_body = b64decode(api_response.json()["httpResponseBody"])
body: str = json.loads(http_response_body)["data"]
print(body)
import asyncio
import json
from base64 import b64decode
from zyte_api import AsyncZyteAPI
async def main():
    """Send a POST request with a Base64-encoded body to https://httpbin.org/anything.

    The request goes through Zyte API; the "data" value of the JSON document
    found in the decoded response body is printed.
    """
    query = {
        "url": "https://httpbin.org/anything",
        "httpResponseBody": True,
        "httpRequestMethod": "POST",
        "httpRequestBody": "Zm9v",
    }
    api_response = await AsyncZyteAPI().get(query)
    # The httpResponseBody field is Base64-encoded; the decoded bytes are parsed as JSON.
    decoded: bytes = b64decode(api_response["httpResponseBody"])
    document = json.loads(decoded)
    print(document["data"])
asyncio.run(main())
import json
from scrapy import Request, Spider
class HTTPBinOrgSpider(Spider):
    """Spider that sends a POST request with a bytes body to https://httpbin.org/anything."""

    name = "httpbin_org"

    def start_requests(self):
        """Yield a single POST request carrying a bytes body."""
        post_request = Request(
            "https://httpbin.org/anything",
            method="POST",
            body=b"foo",
        )
        yield post_request

    def parse(self, response):
        """Parse the response body as JSON and print its "data" value."""
        document = json.loads(response.body)
        print(document["data"])
Output:
foo
Request headers#
In HTTP requests, use customHttpRequestHeaders to set request headers. You can set any header except Cookie (see Cookies).
Tip
You can also set headers like Accept, Accept-Encoding, Accept-Language or User-Agent, but it is usually best to let Zyte API set those headers; it will use values consistent with the network stack and other request parameters (e.g. device, geolocation).
Example
Note
Install and configure code example requirements and the Zyte CA certificate to run the example below.
using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
HttpClientHandler handler = new HttpClientHandler()
{
AutomaticDecompression = DecompressionMethods.All
};
HttpClient client = new HttpClient(handler);
var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);
client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");
var input = new Dictionary<string, object>(){
{"url", "https://httpbin.org/anything"},
{"httpResponseBody", true},
{
"customHttpRequestHeaders",
new List<Dictionary<string, object>>()
{
new Dictionary<string, object>()
{
{"name", "Accept-Language"},
{"value", "fa"}
}
}
}
};
var inputJson = JsonSerializer.Serialize(input);
var content = new StringContent(inputJson, Encoding.UTF8, "application/json");
HttpResponseMessage response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
var body = await response.Content.ReadAsByteArrayAsync();
var data = JsonDocument.Parse(body);
var base64HttpResponseBody = data.RootElement.GetProperty("httpResponseBody").ToString();
var httpResponseBody = System.Convert.FromBase64String(base64HttpResponseBody);
var responseData = JsonDocument.Parse(httpResponseBody);
var headerEnumerator = responseData.RootElement.GetProperty("headers").EnumerateObject();
var headers = new Dictionary<string, string>();
while (headerEnumerator.MoveNext())
{
headers.Add(
headerEnumerator.Current.Name.ToString(),
headerEnumerator.Current.Value.ToString()
);
}
{"url": "https://httpbin.org/anything", "httpResponseBody": true, "customHttpRequestHeaders": [{"name": "Accept-Language", "value": "fa"}]}
zyte-api input.jsonl \
| jq --raw-output .httpResponseBody \
| base64 --decode \
| jq .headers
{
"url": "https://httpbin.org/anything",
"httpResponseBody": true,
"customHttpRequestHeaders": [
{
"name": "Accept-Language",
"value": "fa"
}
]
}
curl \
--user YOUR_API_KEY: \
--header 'Content-Type: application/json' \
--data @input.json \
--compressed \
https://api.zyte.com/v1/extract \
| jq --raw-output .httpResponseBody \
| base64 --decode \
| jq .headers
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Collections;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;
/**
 * Sends a request with a custom header through Zyte API and prints the headers reported by the
 * target site.
 */
class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  /**
   * Posts an extraction request for https://httpbin.org/anything with an {@code Accept-Language}
   * entry in {@code customHttpRequestHeaders}, Base64-decodes the {@code httpResponseBody} field
   * of the API response, parses it as JSON and pretty-prints its {@code headers} object.
   *
   * @param args command-line arguments (unused)
   * @throws IOException if the request fails or the client cannot be closed
   */
  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> customHttpRequestHeader =
        ImmutableMap.of("name", "Accept-Language", "value", "fa");
    Map<String, Object> parameters =
        ImmutableMap.of(
            "url",
            "https://httpbin.org/anything",
            "httpResponseBody",
            true,
            "customHttpRequestHeaders",
            Collections.singletonList(customHttpRequestHeader));
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    // Pass the content type explicitly so the JSON payload (including any non-ASCII header
    // values) is encoded as UTF-8 instead of the text/plain default of the single-argument
    // constructor.
    request.setEntity(new StringEntity(requestBody, ContentType.APPLICATION_JSON));

    // try-with-resources releases the client's connections once the request has completed.
    try (CloseableHttpClient client = HttpClients.createDefault()) {
      client.execute(
          request,
          response -> {
            HttpEntity entity = response.getEntity();
            String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
            JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
            String base64HttpResponseBody = jsonObject.get("httpResponseBody").getAsString();
            byte[] httpResponseBodyBytes = Base64.getDecoder().decode(base64HttpResponseBody);
            String httpResponseBody = new String(httpResponseBodyBytes, StandardCharsets.UTF_8);
            JsonObject data = JsonParser.parseString(httpResponseBody).getAsJsonObject();
            JsonObject headers = data.get("headers").getAsJsonObject();
            Gson gson = new GsonBuilder().setPrettyPrinting().create();
            System.out.println(gson.toJson(headers));
            return null;
          });
    }
  }

  /**
   * Builds the HTTP Basic {@code Authorization} header value, using the API key as the user name
   * and an empty password.
   *
   * @return the header value, {@code "Basic "} followed by the Base64-encoded credentials
   */
  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    // Use an explicit charset so the encoding does not depend on the platform default.
    String encodedAuth = Base64.getEncoder().encodeToString(auth.getBytes(StandardCharsets.UTF_8));
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')
// Fetch https://httpbin.org/anything through Zyte API with a custom
// Accept-Language header, then read the "headers" object from the decoded body.
const extractEndpoint = 'https://api.zyte.com/v1/extract'
const query = {
  url: 'https://httpbin.org/anything',
  httpResponseBody: true,
  customHttpRequestHeaders: [{ name: 'Accept-Language', value: 'fa' }]
}
const requestConfig = { auth: { username: 'YOUR_API_KEY' } }

axios.post(extractEndpoint, query, requestConfig).then(({ data }) => {
  // httpResponseBody arrives Base64-encoded; decode it before parsing the JSON.
  const decodedBody = Buffer.from(data.httpResponseBody, 'base64')
  const { headers } = JSON.parse(decodedBody)
})
<?php

// Fetch https://httpbin.org/anything through Zyte API with a custom
// Accept-Language header and read the "headers" object from the decoded body.
$custom_headers = [
    ['name' => 'Accept-Language', 'value' => 'fa'],
];
$query = [
    'url' => 'https://httpbin.org/anything',
    'httpResponseBody' => true,
    'customHttpRequestHeaders' => $custom_headers,
];

$client = new GuzzleHttp\Client();
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', [
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => $query,
]);

// The httpResponseBody field of the API answer is Base64-encoded JSON here.
$api = json_decode($response->getBody());
$http_response_body = base64_decode($api->httpResponseBody);
$data = json_decode($http_response_body);
$headers = $data->headers;
With the proxy mode, the request headers from your requests are used automatically.
# Request https://httpbin.org/anything through the proxy at api.zyte.com:8011,
# authenticating with the API key as the proxy user name. The Accept-Language
# header is set directly on the request with -H; jq then prints the "headers"
# object of the JSON response.
curl \
--proxy api.zyte.com:8011 \
--proxy-user YOUR_API_KEY: \
--compressed \
-H "Accept-Language: fa" \
https://httpbin.org/anything \
| jq .headers
import json
from base64 import b64decode
import requests
# Fetch https://httpbin.org/anything through Zyte API, sending a custom
# Accept-Language header, and read the "headers" object from the JSON body.
custom_headers = [{"name": "Accept-Language", "value": "fa"}]
query = {
    "url": "https://httpbin.org/anything",
    "httpResponseBody": True,
    "customHttpRequestHeaders": custom_headers,
}
api_response = requests.post(
    "https://api.zyte.com/v1/extract",
    auth=("YOUR_API_KEY", ""),
    json=query,
)
# httpResponseBody is Base64-encoded; decode it before parsing the JSON.
http_response_body = b64decode(api_response.json()["httpResponseBody"])
headers = json.loads(http_response_body)["headers"]
import asyncio
import json
from base64 import b64decode
from zyte_api import AsyncZyteAPI
async def main():
    """Fetch https://httpbin.org/anything through Zyte API and print its headers.

    The request carries a custom ``Accept-Language: fa`` header. The "headers"
    object of the Base64-decoded JSON response body is pretty-printed.
    """
    client = AsyncZyteAPI()
    api_response = await client.get(
        {
            "url": "https://httpbin.org/anything",
            "httpResponseBody": True,
            "customHttpRequestHeaders": [
                {
                    "name": "Accept-Language",
                    "value": "fa",
                },
            ],
        }
    )
    # httpResponseBody is Base64-encoded; decode it before parsing the JSON.
    http_response_body: bytes = b64decode(api_response["httpResponseBody"])
    headers = json.loads(http_response_body)["headers"]
    print(json.dumps(headers, indent=2))
asyncio.run(main())
import json
from scrapy import Request, Spider
class HTTPBinOrgSpider(Spider):
    """Spider that requests https://httpbin.org/anything with a custom header."""

    name = "httpbin_org"

    def start_requests(self):
        """Yield the single start request, carrying ``Accept-Language: fa``."""
        yield Request(
            "https://httpbin.org/anything",
            headers={"Accept-Language": "fa"},
        )

    def parse(self, response):
        """Read the "headers" object from the JSON response text."""
        headers = json.loads(response.text)["headers"]
Output (first 5 lines):
{
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "fa",
"Host": "httpbin.org",
Redirection#
HTTP requests follow HTTP redirection by default. Set followRedirect to false to change that.
Note
Redirection works differently in browser requests.
Device emulation#
In HTTP requests, use device to set the type of device emulation to use for your request: either desktop (default) or mobile.
This option exists because some websites return different content depending on the type of device used to access them.
Note
In a request where you set device to mobile, you cannot use sessionContextParameters.actions.
Example
Note
Install and configure code example requirements and the Zyte CA certificate to run the example below.
using System;
using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
// Send an HTTP request through Zyte API with mobile device emulation and
// print the "user-agent" value found in the decoded JSON response body.
var handler = new HttpClientHandler
{
    AutomaticDecompression = DecompressionMethods.All
};
var client = new HttpClient(handler);

// Basic authentication: the API key is the user name, the password is empty.
var apiKey = "YOUR_API_KEY";
var credentials = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(credentials);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);
client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

var input = new Dictionary<string, object>
{
    ["url"] = "https://httpbin.org/user-agent",
    ["httpResponseBody"] = true,
    ["device"] = "mobile"
};
var content = new StringContent(JsonSerializer.Serialize(input), Encoding.UTF8, "application/json");
var response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
var body = await response.Content.ReadAsByteArrayAsync();

// The httpResponseBody field is Base64-encoded; decode it before parsing.
var data = JsonDocument.Parse(body);
var base64HttpResponseBody = data.RootElement.GetProperty("httpResponseBody").ToString();
var httpResponseBody = System.Convert.FromBase64String(base64HttpResponseBody);
var responseData = JsonDocument.Parse(httpResponseBody);
foreach (var property in responseData.RootElement.EnumerateObject())
{
    if (property.Name == "user-agent")
    {
        Console.WriteLine(property.Value.ToString());
    }
}
{"url": "https://httpbin.org/user-agent", "httpResponseBody": true, "device": "mobile"}
# Pass input.jsonl to the zyte-api command, pull the Base64-encoded
# httpResponseBody field out of its JSON output, decode it, and print the
# "user-agent" value of the decoded JSON body.
zyte-api input.jsonl \
| jq --raw-output .httpResponseBody \
| base64 --decode \
| jq --raw-output '.["user-agent"]'
{
"url": "https://httpbin.org/user-agent",
"httpResponseBody": true,
"device": "mobile"
}
# POST input.json to the Zyte API extract endpoint, authenticating with the
# API key as the user name and an empty password. Then pull the Base64-encoded
# httpResponseBody field out of the JSON answer, decode it, and print the
# "user-agent" value of the decoded JSON body.
curl \
--user YOUR_API_KEY: \
--header 'Content-Type: application/json' \
--data @input.json \
--compressed \
https://api.zyte.com/v1/extract \
| jq --raw-output .httpResponseBody \
| base64 --decode \
| jq --raw-output '.["user-agent"]'
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;
/**
 * Sends an HTTP request with mobile device emulation through Zyte API and prints the
 * {@code user-agent} value found in the decoded JSON response body.
 */
class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  /**
   * Runs the example.
   *
   * @param args unused
   * @throws IOException if the request fails or Zyte API answers with a non-200 status
   */
  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> parameters =
        ImmutableMap.of(
            "url", "https://httpbin.org/user-agent", "httpResponseBody", true, "device", "mobile");
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    // Pass the content type explicitly so the JSON payload is encoded as UTF-8 rather than with
    // the entity's default text/plain charset.
    request.setEntity(new StringEntity(requestBody, ContentType.APPLICATION_JSON));

    // try-with-resources releases the client's connections once the request is done.
    try (CloseableHttpClient client = HttpClients.createDefault()) {
      client.execute(
          request,
          response -> {
            HttpEntity entity = response.getEntity();
            String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
            // Fail with the API's own answer instead of a NullPointerException further down
            // when the response is not a success and httpResponseBody is missing.
            if (response.getCode() != 200) {
              throw new IOException(
                  "Zyte API returned HTTP " + response.getCode() + ": " + apiResponse);
            }
            JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
            String base64HttpResponseBody = jsonObject.get("httpResponseBody").getAsString();
            byte[] httpResponseBodyBytes = Base64.getDecoder().decode(base64HttpResponseBody);
            String httpResponseBody = new String(httpResponseBodyBytes, StandardCharsets.UTF_8);
            JsonObject data = JsonParser.parseString(httpResponseBody).getAsJsonObject();
            String userAgent = data.get("user-agent").getAsString();
            System.out.println(userAgent);
            return null;
          });
    }
  }

  /** Builds the HTTP Basic {@code Authorization} value: API key as user name, empty password. */
  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    // Explicit charset: the no-argument getBytes() depends on the platform default.
    String encodedAuth = Base64.getEncoder().encodeToString(auth.getBytes(StandardCharsets.UTF_8));
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')
// Fetch https://httpbin.org/user-agent through Zyte API with mobile device
// emulation and print the "user-agent" value from the decoded body.
const extractEndpoint = 'https://api.zyte.com/v1/extract'
const query = {
  url: 'https://httpbin.org/user-agent',
  httpResponseBody: true,
  device: 'mobile'
}
const requestConfig = { auth: { username: 'YOUR_API_KEY' } }

axios.post(extractEndpoint, query, requestConfig).then(({ data }) => {
  // httpResponseBody arrives Base64-encoded; decode it before parsing the JSON.
  const decodedBody = Buffer.from(data.httpResponseBody, 'base64')
  const parsedBody = JSON.parse(decodedBody)
  console.log(parsedBody['user-agent'])
})
<?php

// Fetch https://httpbin.org/user-agent through Zyte API with mobile device
// emulation and print the "user-agent" value from the decoded body.
$query = [
    'url' => 'https://httpbin.org/user-agent',
    'httpResponseBody' => true,
    'device' => 'mobile',
];

$client = new GuzzleHttp\Client();
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', [
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => $query,
]);

// The httpResponseBody field of the API answer is Base64-encoded JSON here.
$api = json_decode($response->getBody());
$http_response_body = base64_decode($api->httpResponseBody);
$data = json_decode($http_response_body);
echo $data->{'user-agent'}.PHP_EOL;
With the proxy mode, use the Zyte-Device header.
# Request https://httpbin.org/user-agent through the proxy at
# api.zyte.com:8011, authenticating with the API key as the proxy user name.
# The Zyte-Device header selects mobile device emulation; jq then prints the
# "user-agent" value of the JSON response.
curl \
--proxy api.zyte.com:8011 \
--proxy-user YOUR_API_KEY: \
--compressed \
-H "Zyte-Device: mobile" \
https://httpbin.org/user-agent \
| jq --raw-output '.["user-agent"]'
import json
from base64 import b64decode
import requests
# Fetch https://httpbin.org/user-agent through Zyte API with mobile device
# emulation and print the "user-agent" value from the decoded JSON body.
query = {
    "url": "https://httpbin.org/user-agent",
    "httpResponseBody": True,
    "device": "mobile",
}
api_response = requests.post(
    "https://api.zyte.com/v1/extract",
    auth=("YOUR_API_KEY", ""),
    json=query,
)
# httpResponseBody is Base64-encoded; decode it before parsing the JSON.
http_response_body = b64decode(api_response.json()["httpResponseBody"])
user_agent = json.loads(http_response_body)["user-agent"]
print(user_agent)
import asyncio
import json
from base64 import b64decode
from zyte_api import AsyncZyteAPI
async def main():
    """Fetch https://httpbin.org/user-agent with mobile emulation and print the result.

    The "user-agent" value of the Base64-decoded JSON response body is printed.
    """
    client = AsyncZyteAPI()
    api_response = await client.get(
        {
            "url": "https://httpbin.org/user-agent",
            "httpResponseBody": True,
            "device": "mobile",
        }
    )
    # httpResponseBody is Base64-encoded; decode it before parsing the JSON.
    http_response_body: bytes = b64decode(api_response["httpResponseBody"])
    user_agent = json.loads(http_response_body)["user-agent"]
    print(user_agent)
asyncio.run(main())
import json
from scrapy import Request, Spider
class HTTPBinOrgSpider(Spider):
    """Spider that requests https://httpbin.org/user-agent with mobile emulation."""

    name = "httpbin_org"

    def start_requests(self):
        """Yield the single start request with ``device`` set to ``mobile``."""
        yield Request(
            "https://httpbin.org/user-agent",
            meta={
                "zyte_api_automap": {
                    "device": "mobile",
                }
            },
        )

    def parse(self, response):
        """Print the "user-agent" value from the JSON response text."""
        user_agent = json.loads(response.text)["user-agent"]
        print(user_agent)
Example output (may vary):
Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Mobile Safari/537.36
Submitting HTML forms#
While it may be easier to submit HTML forms using a browser request with actions, it is also possible to reproduce form-submission requests with HTTP requests.
Reproducing an HTML form request usually requires:
- Setting the right value of httpRequestMethod, often POST.
- Setting the Content-Type header to application/x-www-form-urlencoded through customHttpRequestHeaders.
- Setting the right payload, i.e. key-value pairs set by the form.
  - For GET requests, that means setting those key-value pairs in the URL query string.
  - For POST requests, that means encoding those key-value pairs as a query string (without the starting ?) and using that as httpRequestText or httpRequestBody.
Tip
Your key-value pairs may need to include hidden form fields, often used for CSRF tokens or to keep the state of stateful pages (e.g. ASP.NET’s __VIEWSTATE field).
Example
Note
Install and configure code example requirements and the Zyte CA certificate to run the example below.
In https://quotes.toscrape.com/search.aspx you get an HTML form that could be stripped down to:
<form action="/filter.aspx" method="post" >
<select name="author">
<option>----------</option>
<option value="Albert Einstein">
Albert Einstein
</option>
<!-- [more options] -->
</select>
<select name="tag">
<option>----------</option>
</select>
<input type="hidden" name="__VIEWSTATE" value="ZTYzZDZ…">
</form>
When you select an Author (e.g. Albert Einstein), a form request is sent, and the Tag options fill up.
To reproduce that:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
using System.Xml.XPath;
using System.Web;
using HtmlAgilityPack;
// Reproduce the author-selection form request of quotes.toscrape.com with two
// Zyte API HTTP requests and print how many Tag options the answer contains.
var handler = new HttpClientHandler
{
    AutomaticDecompression = DecompressionMethods.All
};
var client = new HttpClient(handler);

// Basic authentication: the API key is the user name, the password is empty.
var apiKey = "YOUR_API_KEY";
var credentials = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(credentials);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);
client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

// Posts a Zyte API query and returns the Base64-decoded httpResponseBody,
// decoded as UTF-8 and loaded into an HtmlDocument.
async Task<HtmlDocument> FetchHtmlAsync(Dictionary<string, object> query)
{
    var payload = new StringContent(JsonSerializer.Serialize(query), Encoding.UTF8, "application/json");
    var apiResponse = await client.PostAsync("https://api.zyte.com/v1/extract", payload);
    var apiBody = await apiResponse.Content.ReadAsByteArrayAsync();
    var apiData = JsonDocument.Parse(apiBody);
    var encodedBody = apiData.RootElement.GetProperty("httpResponseBody").ToString();
    var html = System.Text.Encoding.UTF8.GetString(System.Convert.FromBase64String(encodedBody));
    var document = new HtmlDocument();
    document.LoadHtml(html);
    return document;
}

// First request: load the search page to pick up the hidden __VIEWSTATE value.
var searchPage = await FetchHtmlAsync(new Dictionary<string, object>
{
    ["url"] = "https://quotes.toscrape.com/search.aspx",
    ["httpResponseBody"] = true
});
var viewStateNodes = (XPathNodeIterator)searchPage.CreateNavigator().Evaluate("//*[@name='__VIEWSTATE']/@value");
viewStateNodes.MoveNext();
var viewState = viewStateNodes.Current.ToString();

// Encode the form fields as a URL-encoded query string.
var formFields = new Dictionary<string, string>
{
    ["author"] = "Albert Einstein",
    ["tag"] = "----------",
    ["__VIEWSTATE"] = viewState
};
var httpRequestText = string.Join(
    "&",
    formFields.Select(field => $"{HttpUtility.UrlEncode(field.Key)}={HttpUtility.UrlEncode(field.Value)}"));

// Second request: submit the form payload as a POST to the form action URL.
var filterPage = await FetchHtmlAsync(new Dictionary<string, object>
{
    ["url"] = "https://quotes.toscrape.com/filter.aspx",
    ["httpResponseBody"] = true,
    ["httpRequestMethod"] = "POST",
    ["customHttpRequestHeaders"] = new List<Dictionary<string, object>>
    {
        new Dictionary<string, object>
        {
            ["name"] = "Content-Type",
            ["value"] = "application/x-www-form-urlencoded"
        }
    },
    ["httpRequestText"] = httpRequestText
});
var tagOptions = (XPathNodeIterator)filterPage.CreateNavigator().Evaluate("//*[@name='tag']//option");
var tagCount = 0;
while (tagOptions.MoveNext())
{
    tagCount++;
}
Console.WriteLine($"{tagCount}");
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Base64;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.entity.UrlEncodedFormEntity;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.NameValuePair;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;
import org.apache.hc.core5.http.message.BasicNameValuePair;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
/**
 * Reproduces the author-selection form request of quotes.toscrape.com with two Zyte API HTTP
 * requests and prints the number of Tag options found in the second response.
 */
class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  /**
   * Runs the example.
   *
   * @param args unused
   * @throws IOException if a request fails or Zyte API answers with a non-200 status
   * @throws ParseException if the URL-encoded form entity cannot be converted to a string
   */
  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    // try-with-resources releases the client's connections once both requests are done.
    try (CloseableHttpClient client = HttpClients.createDefault()) {
      // First request: load the search page to pick up the hidden __VIEWSTATE value.
      Map<String, Object> parameters1 =
          ImmutableMap.of(
              "url", "https://quotes.toscrape.com/search.aspx", "httpResponseBody", true);
      String httpResponseBody1 = fetchHttpResponseBody(client, parameters1);
      Document document1 = Jsoup.parse(httpResponseBody1);
      String viewState = document1.select("[name='__VIEWSTATE']").attr("value");

      // Encode the form fields as a URL-encoded query string.
      Map<String, String> params =
          ImmutableMap.of(
              "author", "Albert Einstein",
              "tag", "----------",
              "__VIEWSTATE", viewState);
      List<NameValuePair> formParams = new ArrayList<>();
      for (Map.Entry<String, String> entry : params.entrySet()) {
        formParams.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
      }
      UrlEncodedFormEntity entity = new UrlEncodedFormEntity(formParams, StandardCharsets.UTF_8);
      String httpRequestText = EntityUtils.toString(entity);

      // Second request: submit the form payload as a POST to the form action URL.
      Map<String, Object> customHttpRequestHeader =
          ImmutableMap.of("name", "Content-Type", "value", "application/x-www-form-urlencoded");
      Map<String, Object> parameters2 =
          ImmutableMap.of(
              "url",
              "https://quotes.toscrape.com/filter.aspx",
              "httpResponseBody",
              true,
              "httpRequestMethod",
              "POST",
              "customHttpRequestHeaders",
              Collections.singletonList(customHttpRequestHeader),
              "httpRequestText",
              httpRequestText);
      String httpResponseBody2 = fetchHttpResponseBody(client, parameters2);
      Document document2 = Jsoup.parse(httpResponseBody2);
      Elements tags = document2.select("select[name='tag'] option");
      System.out.println(tags.size());
    }
  }

  /**
   * Posts the given parameters to the Zyte API extract endpoint and returns the Base64-decoded
   * {@code httpResponseBody} field of the answer as a UTF-8 string.
   */
  private static String fetchHttpResponseBody(
      CloseableHttpClient client, Map<String, Object> parameters) throws IOException {
    String requestBody = new Gson().toJson(parameters);
    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    // Pass the content type explicitly so the JSON payload (which embeds the form data) is
    // encoded as UTF-8 rather than with the entity's default text/plain charset.
    request.setEntity(new StringEntity(requestBody, ContentType.APPLICATION_JSON));
    return client.execute(
        request,
        response -> {
          HttpEntity httpEntity = response.getEntity();
          String apiResponse = EntityUtils.toString(httpEntity, StandardCharsets.UTF_8);
          // Fail with the API's own answer instead of a NullPointerException further down
          // when the response is not a success and httpResponseBody is missing.
          if (response.getCode() != 200) {
            throw new IOException(
                "Zyte API returned HTTP " + response.getCode() + ": " + apiResponse);
          }
          JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
          String base64HttpResponseBody = jsonObject.get("httpResponseBody").getAsString();
          byte[] httpResponseBodyBytes = Base64.getDecoder().decode(base64HttpResponseBody);
          return new String(httpResponseBodyBytes, StandardCharsets.UTF_8);
        });
  }

  /** Builds the HTTP Basic {@code Authorization} value: API key as user name, empty password. */
  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    // Explicit charset: the no-argument getBytes() depends on the platform default.
    String encodedAuth = Base64.getEncoder().encodeToString(auth.getBytes(StandardCharsets.UTF_8));
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')
const cheerio = require('cheerio')
const querystring = require('querystring')
// Reproduce the author-selection form request of quotes.toscrape.com with two
// Zyte API HTTP requests and print how many Tag options the answer contains.
const extractEndpoint = 'https://api.zyte.com/v1/extract'
const requestConfig = { auth: { username: 'YOUR_API_KEY' } }

// Base64-decode the httpResponseBody field of an API response and load it
// into cheerio.
const loadBody = (apiResponse) =>
  cheerio.load(Buffer.from(apiResponse.data.httpResponseBody, 'base64'))

axios
  .post(
    extractEndpoint,
    {
      url: 'https://quotes.toscrape.com/search.aspx',
      httpResponseBody: true
    },
    requestConfig
  )
  .then((searchResponse) => {
    // Pick up the hidden __VIEWSTATE field and encode the form payload.
    const $ = loadBody(searchResponse)
    const viewState = $('[name="__VIEWSTATE"]').get(0).attribs.value
    const httpRequestText = querystring.stringify({
      author: 'Albert Einstein',
      tag: '----------',
      __VIEWSTATE: viewState
    })
    return axios.post(
      extractEndpoint,
      {
        url: 'https://quotes.toscrape.com/filter.aspx',
        httpResponseBody: true,
        httpRequestMethod: 'POST',
        customHttpRequestHeaders: [
          { name: 'Content-Type', value: 'application/x-www-form-urlencoded' }
        ],
        httpRequestText
      },
      requestConfig
    )
  })
  .then((filterResponse) => {
    const $ = loadBody(filterResponse)
    console.log($('select[name="tag"] option').length)
  })
<?php

// Reproduce the author-selection form request of quotes.toscrape.com with two
// Zyte API HTTP requests and print how many Tag options the answer contains.
$client = new GuzzleHttp\Client();

// Posts a query to Zyte API and returns the Base64-decoded httpResponseBody.
$extract = function (array $query) use ($client) {
    $response = $client->request('POST', 'https://api.zyte.com/v1/extract', [
        'auth' => ['YOUR_API_KEY', ''],
        'headers' => ['Accept-Encoding' => 'gzip'],
        'json' => $query,
    ]);
    $data = json_decode($response->getBody());

    return base64_decode($data->httpResponseBody);
};

// First request: load the search page to pick up the hidden __VIEWSTATE value.
$http_response_body = $extract([
    'url' => 'https://quotes.toscrape.com/search.aspx',
    'httpResponseBody' => true,
]);
$doc = new DOMDocument();
$doc->loadHTML($http_response_body);
$xpath_1 = new DOMXPath($doc);
$view_state = $xpath_1->query('//*[@name="__VIEWSTATE"]/@value')->item(0)->nodeValue;

// Encode the form fields as a URL-encoded query string.
$http_request_text = http_build_query([
    'author' => 'Albert Einstein',
    'tag' => '----------',
    '__VIEWSTATE' => $view_state,
]);

// Second request: submit the form payload as a POST to the form action URL.
$http_response_body = $extract([
    'url' => 'https://quotes.toscrape.com/filter.aspx',
    'httpResponseBody' => true,
    'httpRequestMethod' => 'POST',
    'customHttpRequestHeaders' => [
        [
            'name' => 'Content-Type',
            'value' => 'application/x-www-form-urlencoded',
        ],
    ],
    'httpRequestText' => $http_request_text,
]);
$doc->loadHTML($http_response_body);
$xpath_2 = new DOMXPath($doc);
$tags = $xpath_2->query('//*[@name="tag"]/option');
echo count($tags).PHP_EOL;
Install form2request, which makes it easier to handle HTML forms in Python.
Then:
from base64 import b64decode
from form2request import form2request
from parsel import Selector
import requests
# Reproduce the author-selection form request of quotes.toscrape.com with two
# Zyte API HTTP requests and print how many Tag options the answer contains.
api_url = "https://api.zyte.com/v1/extract"
api_auth = ("YOUR_API_KEY", "")

# First request: download the page that holds the form.
api_response_1 = requests.post(
    api_url,
    auth=api_auth,
    json={
        "url": "https://quotes.toscrape.com/search.aspx",
        "httpResponseBody": True,
    },
)
api_response_1_data = api_response_1.json()
http_response_body_1 = b64decode(api_response_1_data["httpResponseBody"])
selector_1 = Selector(body=http_response_body_1, base_url=api_response_1_data["url"])

# Build the form-submission request with form2request from the parsed form.
form = selector_1.css("form")
request = form2request(form, {"author": "Albert Einstein"}, click=False)
custom_headers = [{"name": k, "value": v} for k, v in request.headers]

# Second request: replay that form submission through Zyte API.
api_response_2 = requests.post(
    api_url,
    auth=api_auth,
    json={
        "url": request.url,
        "httpRequestMethod": request.method,
        "customHttpRequestHeaders": custom_headers,
        "httpRequestText": request.body.decode(),
        "httpResponseBody": True,
    },
)
http_response_body_2 = b64decode(api_response_2.json()["httpResponseBody"])
selector_2 = Selector(body=http_response_body_2)
print(len(selector_2.css("select[name='tag'] option")))
Install form2request, which makes it easier to handle HTML forms in Python.
Then:
import asyncio
from base64 import b64decode
from form2request import form2request
from parsel import Selector
from zyte_api import AsyncZyteAPI
async def main():
    """Replay the quotes.toscrape.com author form through Zyte API.

    Downloads the search page, builds the form-submission request with
    form2request, sends it through Zyte API, and prints the number of Tag
    options found in the second response.
    """
    client = AsyncZyteAPI()
    # First request: download the page that holds the form.
    api_response_1 = await client.get(
        {
            "url": "https://quotes.toscrape.com/search.aspx",
            "httpResponseBody": True,
        }
    )
    http_response_body_1 = b64decode(api_response_1["httpResponseBody"])
    selector_1 = Selector(body=http_response_body_1, base_url=api_response_1["url"])
    form = selector_1.css("form")
    request = form2request(form, {"author": "Albert Einstein"}, click=False)
    # Second request: replay the form submission described by form2request.
    api_response_2 = await client.get(
        {
            "url": request.url,
            "httpRequestMethod": request.method,
            "customHttpRequestHeaders": [
                {"name": k, "value": v} for k, v in request.headers
            ],
            "httpRequestText": request.body.decode(),
            "httpResponseBody": True,
        }
    )
    http_response_body_2 = b64decode(api_response_2["httpResponseBody"])
    selector_2 = Selector(body=http_response_body_2)
    print(len(selector_2.css("select[name='tag'] option")))
asyncio.run(main())
Install form2request, which makes it easier to handle HTML forms in Scrapy.
Then, use it and let transparent mode take care of the rest:
from form2request import form2request
from scrapy import Spider
class QuotesToScrapeComSpider(Spider):
    """Spider that submits the author form of quotes.toscrape.com."""

    name = "quotes_toscrape_com"
    start_urls = ["https://quotes.toscrape.com/search.aspx"]

    def parse(self, response):
        """Build the form-submission request for one author and yield it."""
        form = response.css("form")
        request = form2request(form, {"author": "Albert Einstein"}, click=False)
        yield request.to_scrapy(callback=self.parse_tags)

    def parse_tags(self, response):
        """Print the number of Tag options in the form-submission response."""
        print(len(response.css("select[name='tag'] option")))
Output (number of Tag options):
25
Decoding HTML#
HTML extracted as a response body needs to be decoded.
HTML content can be encoded with one of many character encodings, and you must determine the character encoding used so that you can decode that HTML content accordingly.
The best way to determine the encoding of HTML content is to follow the encoding sniffing algorithm defined in the HTML standard.
In addition to the HTML content, the HTML encoding sniffing algorithm takes
into account any character encoding provided in the optional charset
parameter of media types declared in the Content-Type
response header, so
make sure you get the response headers in
addition to the response body if you are following the HTML encoding sniffing
algorithm.
Example
Note
Install and configure code example requirements and the Zyte CA certificate to run the example below.
Use file to find the character encoding of a previously-downloaded response based solely on its body (i.e. not following the HTML encoding sniffing algorithm).
file --mime-encoding output.html
Use content-type-parser, html-encoding-sniffer and whatwg-encoding:
const contentTypeParser = require('content-type-parser')
const htmlEncodingSniffer = require('html-encoding-sniffer')
const whatwgEncoding = require('whatwg-encoding')
// …
// Find the charset declared in the Content-Type response header, if any.
const httpResponseHeaders = response.data.httpResponseHeaders
let contentTypeCharset
for (const header of httpResponseHeaders) {
  if (header.name.toLowerCase() === 'content-type') {
    // Guard against a null parse result for a malformed header value, so the
    // sniffer below can still fall back to the body.
    const parsedContentType = contentTypeParser(header.value)
    contentTypeCharset = parsedContentType
      ? parsedContentType.get('charset')
      : undefined
  }
}
// Sniff the encoding from the Base64-decoded body, passing the header charset
// as the transport-layer label, then decode the bytes with that encoding.
const httpResponseBody = Buffer.from(response.data.httpResponseBody, 'base64')
const encoding = htmlEncodingSniffer(httpResponseBody, {
  transportLayerEncodingLabel: contentTypeCharset
})
const html = whatwgEncoding.decode(httpResponseBody, encoding)
web-poet provides a response wrapper that automatically decodes the response body following an encoding sniffing algorithm similar to the one defined in the HTML standard.
Provided that you have extracted a response with both body and headers, and you have Base64-decoded the response body, you can decode the HTML bytes as follows:
from web_poet import HttpResponse
# …
# Turn the header objects (dicts with "name" and "value" keys) into
# (name, value) pairs for HttpResponse.
headers = tuple(
    (header["name"], header["value"]) for header in http_response_headers
)
response = HttpResponse(
    url="https://example.com",
    body=http_response_body,
    status=200,
    headers=headers,
)
# The text attribute holds the decoded HTML.
html = response.text
In transparent mode, regular Scrapy requests targeting HTML resources decode them by default. See Zyte API HTTP requests.
HTML and browser HTML#
HTML found in httpResponseBody is usually different from HTML found in browserHtml (browser HTML):
httpResponseBody does not reflect changes that a webpage makes at run time using JavaScript, such as loading content from additional URLs, or moving or reformatting content within the webpage.
browserHtml includes a normalization of the HTML from the underlying HTTP response, which web browsers perform according to the HTML5 specification. So the content of HTML and browser HTML could be different even when there is no JavaScript involved.
Parsing HTML from httpResponseBody with libraries that do not implement HTML5 parsing, such as lxml.html (used by Scrapy by default), results in a different tree structure.
With an HTML5-compatible parser the resulting tree structure would be the same, provided JavaScript does not cause any other difference.
Because of these differences, switching between these HTML inputs can break your existing parsing code and require changes, such as updating XPath or CSS selectors.