Zyte API browser automation#
To use a browser through Zyte API, enable browserHtml, screenshot, or both.
For browser requests, Zyte API also supports:
Unlike HTTP requests, browser requests do not support:
An HTTP request method or body.
Note
This only affects the initial request. During a browser request, as a result of redirection, JavaScript, or actions, additional requests may be sent with no limitation on method or body.
Returning non-HTML response data, other than a screenshot.
Browser HTML#
Browser HTML is the HTML representation of the Document Object Model (DOM) of a webpage after it has been rendered in a browser.
To get browser HTML, set the browserHtml key in your API request body to true. The browserHtml key of the response JSON object is the browser HTML as a string.
Example
Note
Install and configure code example requirements to run the example below.
using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
// Requests the browser-rendered HTML of a page through Zyte API.
// Disposable objects are released with `using`, and a non-success HTTP status
// raises an exception before the response body is parsed.
var handler = new HttpClientHandler
{
    AutomaticDecompression = DecompressionMethods.All
};
// Disposing the client also disposes the handler it was built with.
using var client = new HttpClient(handler);

// HTTP Basic auth: the API key is the user name and the password is empty.
var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);
client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

var input = new Dictionary<string, object>()
{
    {"url", "https://toscrape.com"},
    {"browserHtml", true}
};
var inputJson = JsonSerializer.Serialize(input);
using var content = new StringContent(inputJson, Encoding.UTF8, "application/json");

using HttpResponseMessage response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
response.EnsureSuccessStatusCode();
var body = await response.Content.ReadAsByteArrayAsync();
using var data = JsonDocument.Parse(body);
// GetString() returns the JSON string value and throws if the value is not a string.
var browserHtml = data.RootElement.GetProperty("browserHtml").GetString();
{"url": "https://toscrape.com", "browserHtml": true}
# POST input.json to Zyte API (API key as the Basic-auth user name, empty
# password) and print the browserHtml field of the JSON response.
curl \
    --compressed \
    --user YOUR_API_KEY: \
    --header 'Content-Type: application/json' \
    --data @input.json \
    https://api.zyte.com/v1/extract \
  | jq --raw-output .browserHtml
{"url": "https://toscrape.com", "browserHtml": true}
# Run input.jsonl through the zyte-api command, discard its stderr output,
# and print the browserHtml field of the result.
zyte-api input.jsonl 2>/dev/null \
  | jq --raw-output .browserHtml
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;
/** Requests the browser-rendered HTML of a page through Zyte API. */
class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> parameters =
        ImmutableMap.of("url", "https://toscrape.com", "browserHtml", true);
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    // Pass the content type explicitly: the single-argument StringEntity
    // constructor defaults to text/plain with ISO-8859-1, which would not
    // match the UTF-8 JSON declared in the Content-Type header.
    request.setEntity(new StringEntity(requestBody, ContentType.APPLICATION_JSON));

    try (CloseableHttpClient client = HttpClients.createDefault();
        CloseableHttpResponse response = client.execute(request)) {
      HttpEntity entity = response.getEntity();
      String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
      JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
      String browserHtml = jsonObject.get("browserHtml").getAsString();
    }
  }

  /** Builds the HTTP Basic Authorization header: API key as user name, empty password. */
  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    // Use an explicit charset so the result does not depend on the platform default.
    String encodedAuth =
        Base64.getEncoder().encodeToString(auth.getBytes(StandardCharsets.UTF_8));
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')
// Request the browser-rendered HTML of the page through Zyte API.
const requestBody = {
  url: 'https://toscrape.com',
  browserHtml: true
}
const requestConfig = {
  // The API key is sent as the Basic-auth user name; no password is set.
  auth: { username: 'YOUR_API_KEY' }
}
axios
  .post('https://api.zyte.com/v1/extract', requestBody, requestConfig)
  .then(({ data }) => {
    const browserHtml = data.browserHtml
  })
<?php
// Ask Zyte API for the browser-rendered HTML of the page.
$payload = [
    'url' => 'https://toscrape.com',
    'browserHtml' => true,
];
$options = [
    // API key as the Basic-auth user name, with an empty password.
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => $payload,
];
$client = new GuzzleHttp\Client();
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', $options);
// The decoded response object exposes the HTML as its browserHtml property.
$api = json_decode($response->getBody());
$browser_html = $api->browserHtml;
import requests
# Ask Zyte API for the browser-rendered HTML of the page. The API key is the
# HTTP Basic auth user name and the password is empty.
request_body = {
    'url': 'https://toscrape.com',
    'browserHtml': True,
}
api_response = requests.post(
    'https://api.zyte.com/v1/extract',
    auth=('YOUR_API_KEY', ''),
    json=request_body,
)
response_data = api_response.json()
browser_html: str = response_data['browserHtml']
import asyncio
from zyte_api.aio.client import AsyncClient
async def main():
    """Fetch the browser-rendered HTML of the page through Zyte API."""
    query = {
        'url': 'https://toscrape.com',
        'browserHtml': True,
    }
    client = AsyncClient()
    api_response = await client.request_raw(query)
    browser_html: str = api_response['browserHtml']


asyncio.run(main())
from scrapy import Request, Spider
class ToScrapeSpider(Spider):
    """Spider that requests browser HTML for https://toscrape.com."""

    name = "toscrape_com"

    def start_requests(self):
        # browserHtml is requested through the zyte_api_automap request meta key.
        zyte_api_params = {"browserHtml": True}
        yield Request(
            "https://toscrape.com",
            meta={"zyte_api_automap": zyte_api_params},
        )

    def parse(self, response):
        # The response text is used as the browser HTML.
        browser_html: str = response.text
Note
Browser HTML does not include the contents of iframes or the shadow DOM.
Screenshot#
You can set the screenshot key in your API request body to true to extract a screenshot. The screenshot key of the response JSON object is the Base64-encoded screenshot.
Example
Note
Install and configure code example requirements to run the example below.
using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
// Requests a screenshot of a page through Zyte API and decodes it to bytes.
// Disposable objects are released with `using`, and a non-success HTTP status
// raises an exception before the response body is parsed.
var handler = new HttpClientHandler
{
    AutomaticDecompression = DecompressionMethods.All
};
// Disposing the client also disposes the handler it was built with.
using var client = new HttpClient(handler);

// HTTP Basic auth: the API key is the user name and the password is empty.
var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);
client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

var input = new Dictionary<string, object>()
{
    {"url", "https://toscrape.com"},
    {"screenshot", true}
};
var inputJson = JsonSerializer.Serialize(input);
using var content = new StringContent(inputJson, Encoding.UTF8, "application/json");

using HttpResponseMessage response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
response.EnsureSuccessStatusCode();
var body = await response.Content.ReadAsByteArrayAsync();
using var data = JsonDocument.Parse(body);
// The screenshot field is a Base64 string; decode it to the raw image bytes.
var base64Screenshot = data.RootElement.GetProperty("screenshot").GetString();
var screenshot = System.Convert.FromBase64String(base64Screenshot);
{"url": "https://toscrape.com", "screenshot": true}
# POST input.json to Zyte API, extract the Base64 screenshot field from the
# JSON response, decode it, and save the result as screenshot.jpg.
curl \
    --compressed \
    --user YOUR_API_KEY: \
    --header 'Content-Type: application/json' \
    --data @input.json \
    https://api.zyte.com/v1/extract \
  | jq --raw-output .screenshot \
  | base64 --decode \
  > screenshot.jpg
{"url": "https://toscrape.com", "screenshot": true}
# Run input.jsonl through the zyte-api command, discard its stderr output,
# decode the Base64 screenshot field, and save it as screenshot.jpg.
zyte-api input.jsonl 2>/dev/null \
  | jq --raw-output .screenshot \
  | base64 --decode \
  > screenshot.jpg
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;
/** Requests a screenshot of a page through Zyte API and decodes it to bytes. */
class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> parameters =
        ImmutableMap.of("url", "https://toscrape.com", "screenshot", true);
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    // Pass the content type explicitly: the single-argument StringEntity
    // constructor defaults to text/plain with ISO-8859-1, which would not
    // match the UTF-8 JSON declared in the Content-Type header.
    request.setEntity(new StringEntity(requestBody, ContentType.APPLICATION_JSON));

    try (CloseableHttpClient client = HttpClients.createDefault();
        CloseableHttpResponse response = client.execute(request)) {
      HttpEntity entity = response.getEntity();
      String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
      JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
      // The screenshot field is a Base64 string; decode it to the raw image bytes.
      String base64Screenshot = jsonObject.get("screenshot").getAsString();
      byte[] screenshot = Base64.getDecoder().decode(base64Screenshot);
    }
  }

  /** Builds the HTTP Basic Authorization header: API key as user name, empty password. */
  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    // Use an explicit charset so the result does not depend on the platform default.
    String encodedAuth =
        Base64.getEncoder().encodeToString(auth.getBytes(StandardCharsets.UTF_8));
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')
// Request a screenshot of the page through Zyte API.
const requestBody = {
  url: 'https://toscrape.com',
  screenshot: true
}
const requestConfig = {
  // The API key is sent as the Basic-auth user name; no password is set.
  auth: { username: 'YOUR_API_KEY' }
}
axios
  .post('https://api.zyte.com/v1/extract', requestBody, requestConfig)
  .then(({ data }) => {
    // The screenshot field is Base64 text; decode it into a Buffer.
    const screenshot = Buffer.from(data.screenshot, 'base64')
  })
<?php
// Ask Zyte API for a screenshot of the page.
$payload = [
    'url' => 'https://toscrape.com',
    'screenshot' => true,
];
$options = [
    // API key as the Basic-auth user name, with an empty password.
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => $payload,
];
$client = new GuzzleHttp\Client();
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', $options);
$api = json_decode($response->getBody());
// The screenshot property is Base64 text; decode it to the raw image bytes.
$screenshot = base64_decode($api->screenshot);
from base64 import b64decode
import requests
# Ask Zyte API for a screenshot of the page. The API key is the HTTP Basic
# auth user name and the password is empty.
request_body = {
    'url': 'https://toscrape.com',
    'screenshot': True,
}
api_response = requests.post(
    'https://api.zyte.com/v1/extract',
    auth=('YOUR_API_KEY', ''),
    json=request_body,
)
# The screenshot field is Base64 text; decode it to the raw image bytes.
base64_screenshot = api_response.json()['screenshot']
screenshot: bytes = b64decode(base64_screenshot)
import asyncio
from base64 import b64decode
from zyte_api.aio.client import AsyncClient
async def main():
    """Fetch a screenshot of the page through Zyte API and decode it."""
    query = {
        'url': 'https://toscrape.com',
        'screenshot': True,
    }
    client = AsyncClient()
    api_response = await client.request_raw(query)
    # The screenshot field is Base64 text; decode it to the raw image bytes.
    screenshot: bytes = b64decode(api_response['screenshot'])


asyncio.run(main())
from base64 import b64decode
from scrapy import Request, Spider
class ToScrapeComSpider(Spider):
    """Spider that requests a screenshot of https://toscrape.com."""

    name = "toscrape_com"

    def start_requests(self):
        # The screenshot is requested through the zyte_api_automap request meta key.
        zyte_api_params = {"screenshot": True}
        yield Request(
            "https://toscrape.com",
            meta={"zyte_api_automap": zyte_api_params},
        )

    def parse(self, response):
        # The Base64 screenshot is read from the raw API response and decoded.
        base64_screenshot = response.raw_api_response["screenshot"]
        screenshot: bytes = b64decode(base64_screenshot)
You may also define a screenshotOptions key in your API request body to configure the format and scope of the screenshot. For more information, look up screenshotOptions in Zyte API reference documentation.
Actions#
When using browser automation, you can use the actions
key in your API
request body to define a sequence of actions to perform during browser
rendering, and hence modify the DOM before the requested output is generated
for you.
Example
Note
Install and configure code example requirements to run the example below.
using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
using HtmlAgilityPack;
// Requests browser HTML after a scrollBottom action and counts the elements
// whose class attribute is "quote".
// Disposable objects are released with `using`, and a non-success HTTP status
// raises an exception before the response body is parsed.
var handler = new HttpClientHandler
{
    AutomaticDecompression = DecompressionMethods.All
};
// Disposing the client also disposes the handler it was built with.
using var client = new HttpClient(handler);

// HTTP Basic auth: the API key is the user name and the password is empty.
var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);
client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

var input = new Dictionary<string, object>()
{
    {"url", "https://quotes.toscrape.com/scroll"},
    {"browserHtml", true},
    {
        "actions",
        new List<Dictionary<string, object>>()
        {
            new Dictionary<string, object>()
            {
                {"action", "scrollBottom"}
            }
        }
    }
};
var inputJson = JsonSerializer.Serialize(input);
using var content = new StringContent(inputJson, Encoding.UTF8, "application/json");

using HttpResponseMessage response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
response.EnsureSuccessStatusCode();
var body = await response.Content.ReadAsByteArrayAsync();
using var data = JsonDocument.Parse(body);
// GetString() returns the JSON string value and throws if the value is not a string.
var browserHtml = data.RootElement.GetProperty("browserHtml").GetString();

var htmlDocument = new HtmlDocument();
htmlDocument.LoadHtml(browserHtml);
var navigator = htmlDocument.CreateNavigator();
// The XPath count() function evaluates to a number, hence the double cast.
var quoteCount = (double)navigator.Evaluate("count(//*[@class=\"quote\"])");
{
"url": "https://quotes.toscrape.com/scroll",
"browserHtml": true,
"actions": [
{
"action": "scrollBottom"
}
]
}
# POST input.json to Zyte API, extract the browser HTML from the JSON
# response, and count the elements whose class attribute is "quote".
curl \
    --compressed \
    --user YOUR_API_KEY: \
    --header 'Content-Type: application/json' \
    --data @input.json \
    https://api.zyte.com/v1/extract \
  | jq --raw-output .browserHtml \
  | xmllint --html --xpath 'count(//*[@class="quote"])' - 2>/dev/null
{"url":"https://quotes.toscrape.com/scroll","browserHtml":true,"actions":[{"action":"scrollBottom"}]}
# Run input.jsonl through the zyte-api command, discard its stderr output,
# and count the elements whose class attribute is "quote".
zyte-api input.jsonl 2>/dev/null \
  | jq --raw-output .browserHtml \
  | xmllint --html --xpath 'count(//*[@class="quote"])' - 2>/dev/null
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Collections;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/** Requests browser HTML after a scrollBottom action and counts the ".quote" elements. */
class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> action = ImmutableMap.of("action", "scrollBottom");
    Map<String, Object> parameters =
        ImmutableMap.of(
            "url",
            "https://quotes.toscrape.com/scroll",
            "browserHtml",
            true,
            "actions",
            Collections.singletonList(action));
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    // Pass the content type explicitly: the single-argument StringEntity
    // constructor defaults to text/plain with ISO-8859-1, which would not
    // match the UTF-8 JSON declared in the Content-Type header.
    request.setEntity(new StringEntity(requestBody, ContentType.APPLICATION_JSON));

    try (CloseableHttpClient client = HttpClients.createDefault();
        CloseableHttpResponse response = client.execute(request)) {
      HttpEntity entity = response.getEntity();
      String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
      JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
      String browserHtml = jsonObject.get("browserHtml").getAsString();
      Document document = Jsoup.parse(browserHtml);
      int quoteCount = document.select(".quote").size();
    }
  }

  /** Builds the HTTP Basic Authorization header: API key as user name, empty password. */
  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    // Use an explicit charset so the result does not depend on the platform default.
    String encodedAuth =
        Base64.getEncoder().encodeToString(auth.getBytes(StandardCharsets.UTF_8));
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')
const cheerio = require('cheerio')
// Request browser HTML after a scrollBottom action, then count the
// elements matching the .quote selector.
const requestBody = {
  url: 'https://quotes.toscrape.com/scroll',
  browserHtml: true,
  actions: [{ action: 'scrollBottom' }]
}
const requestConfig = {
  // The API key is sent as the Basic-auth user name; no password is set.
  auth: { username: 'YOUR_API_KEY' }
}
axios
  .post('https://api.zyte.com/v1/extract', requestBody, requestConfig)
  .then(({ data }) => {
    const browserHtml = data.browserHtml
    const $ = cheerio.load(browserHtml)
    const quoteCount = $('.quote').length
  })
<?php
// Request browser HTML after a scrollBottom action, then count the elements
// whose class attribute is "quote".
$client = new GuzzleHttp\Client();
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', [
    // API key as the Basic-auth user name, with an empty password.
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => [
        'url' => 'https://quotes.toscrape.com/scroll',
        'browserHtml' => true,
        'actions' => [
            ['action' => 'scrollBottom'],
        ],
    ],
]);
$data = json_decode($response->getBody());
$doc = new DOMDocument();
// DOMDocument::loadHTML() reports markup it does not recognize as PHP
// warnings; collect those errors internally and clear them instead, then
// restore the previous libxml error-handling setting.
$previous_setting = libxml_use_internal_errors(true);
$doc->loadHTML($data->browserHtml);
libxml_clear_errors();
libxml_use_internal_errors($previous_setting);
$xpath = new DOMXPath($doc);
$quote_count = $xpath->query("//*[@class='quote']")->count();
import requests
from parsel import Selector

# Request browser HTML after a scrollBottom action. The API key is the HTTP
# Basic auth user name and the password is empty.
api_response = requests.post(
    'https://api.zyte.com/v1/extract',
    auth=('YOUR_API_KEY', ''),
    json={
        'url': 'https://quotes.toscrape.com/scroll',
        'browserHtml': True,
        'actions': [
            {
                'action': 'scrollBottom',
            },
        ],
    },
)
browser_html = api_response.json()['browserHtml']
# Count the elements matching the .quote CSS selector.
quote_count = len(Selector(browser_html).css('.quote'))
import asyncio

from parsel import Selector
from zyte_api.aio.client import AsyncClient


async def main():
    """Fetch browser HTML after a scrollBottom action and count the quotes."""
    client = AsyncClient()
    api_response = await client.request_raw(
        {
            'url': 'https://quotes.toscrape.com/scroll',
            'browserHtml': True,
            'actions': [
                {
                    'action': 'scrollBottom',
                },
            ],
        },
    )
    browser_html = api_response['browserHtml']
    # Count the elements matching the .quote CSS selector.
    quote_count = len(Selector(browser_html).css('.quote'))


asyncio.run(main())
from scrapy import Request, Spider
class QuotesToScrapeComSpider(Spider):
    """Spider that requests browser HTML after a scrollBottom action."""

    name = "quotes_toscrape_com"

    def start_requests(self):
        # Browser HTML and the action list are requested through the
        # zyte_api_automap request meta key.
        zyte_api_params = {
            "browserHtml": True,
            "actions": [{"action": "scrollBottom"}],
        }
        yield Request(
            "https://quotes.toscrape.com/scroll",
            meta={"zyte_api_automap": zyte_api_params},
        )

    def parse(self, response):
        # Count the elements matching the .quote CSS selector.
        quotes = response.css(".quote")
        quote_count = len(quotes)
Look up actions
in the specification for the
complete actions API.
Action types#
Zyte API supports 3 types of browser actions:
Generic actions work on every website. They allow you to type text into input fields, perform cursor actions (click, hover, scroll), and wait for certain events or for a given time.
Special actions expose functionality that requires specific knowledge of the target website, such as using their search box or filling a form.
They are only available for certain websites. To find out if an action is available for a given website, send a test request using that action. If the action is not supported, you will get an error API response indicating so.
Browser scripts, available for Enterprise plans.
Action limits#
You are free to use as many actions as you wish, but total browser execution time is limited to 60 seconds. If your actions are still running by that time, the on-going action is interrupted, follow-up actions are not executed at all, and you get your requested output (browser HTML, screenshot) as it was rendered at that time.
The Zyte API response includes an actions key that provides details about action execution, including elapsedTime, error, and status fields, to help you debug your actions, e.g. to find out which actions were executed successfully and which actions were not.
Action selectors#
Browser actions that interact with a webpage element all have a selector
key that allows you to define how to find the target webpage element.
You must define a query to find the target webpage element in the
selector.value
field.
You must specify the language of your query in the selector.type field, which supports the following values: CSS Selector (css), XPath 1.0 (xpath). For information about these query languages, see Learning CSS and XPath.
Note that selectors have some limitations:
Selectors cannot interact with iframes. Only browser scripts can.
Selectors cannot access the shadow DOM.
Wait actions#
You can use the following browser actions to introduce wait times in your browser action sequences or in your browser scripts: waitForSelector, waitForRequest, waitForResponse, and waitForTimeout.
Whenever you need to wait for something to happen on a webpage, you should consider using waitForSelector first. It waits for an element matching a given selector. By default, it waits for a matching visible element, but you can change selector.state to attached, to wait for an element to exist regardless of visibility, or to hidden, to wait for a matching invisible element.
waitForRequest
and waitForResponse
wait for a request to be sent or for
a response to be received, filtering by URL pattern.
waitForTimeout
pauses your sequence of actions or your browser script for
the specified amount of time. Because action run time is limited, you should avoid using this type of action when an
alternative waiting action can replace it. However, this action can be
necessary for certain scenarios, such as simulating human reaction time.
Request headers#
When extracting browser HTML or a
screenshot, you can set the requestHeaders
key
in your API request body to an object where keys are camelCase header names and
values are header values, representing headers to include in your request.
Example
Note
Install and configure code example requirements to run the example below.
using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
using System.Xml.XPath;
using HtmlAgilityPack;
// Requests browser HTML with a custom Referer request header, then parses the
// first text node of the page as JSON and collects its "headers" object.
// Disposable objects are released with `using`, and a non-success HTTP status
// raises an exception before the response body is parsed.
var handler = new HttpClientHandler
{
    AutomaticDecompression = DecompressionMethods.All
};
// Disposing the client also disposes the handler it was built with.
using var client = new HttpClient(handler);

// HTTP Basic auth: the API key is the user name and the password is empty.
var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);
client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

var input = new Dictionary<string, object>()
{
    {"url", "https://httpbin.org/anything"},
    {"browserHtml", true},
    {
        "requestHeaders",
        new Dictionary<string, object>()
        {
            {"referer", "https://example.org/"}
        }
    }
};
var inputJson = JsonSerializer.Serialize(input);
using var content = new StringContent(inputJson, Encoding.UTF8, "application/json");

using HttpResponseMessage response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
response.EnsureSuccessStatusCode();
var body = await response.Content.ReadAsByteArrayAsync();
using var data = JsonDocument.Parse(body);
// GetString() returns the JSON string value and throws if the value is not a string.
var browserHtml = data.RootElement.GetProperty("browserHtml").GetString();

var htmlDocument = new HtmlDocument();
htmlDocument.LoadHtml(browserHtml);
var navigator = htmlDocument.CreateNavigator();
var nodeIterator = (XPathNodeIterator)navigator.Evaluate("//text()");
// Fail with a clear message when the page has no text node to parse.
if (!nodeIterator.MoveNext())
{
    throw new System.InvalidOperationException("The browser HTML contains no text nodes.");
}
var responseJson = nodeIterator.Current.ToString();
using var responseData = JsonDocument.Parse(responseJson);

// Copy every property of the "headers" JSON object into a dictionary.
var headers = new Dictionary<string, string>();
foreach (var header in responseData.RootElement.GetProperty("headers").EnumerateObject())
{
    headers.Add(header.Name, header.Value.ToString());
}
{
"url": "https://httpbin.org/anything",
"browserHtml": true,
"requestHeaders": {
"referer": "https://example.org/"
}
}
# POST input.json to Zyte API, extract the text of the returned browser HTML,
# and print the "headers" field of the JSON that text contains.
curl \
    --compressed \
    --user YOUR_API_KEY: \
    --header 'Content-Type: application/json' \
    --data @input.json \
    https://api.zyte.com/v1/extract \
  | jq --raw-output .browserHtml \
  | xmllint --html --xpath '//text()' - 2>/dev/null \
  | jq .headers
{"url": "https://httpbin.org/anything", "browserHtml": true, "requestHeaders": {"referer": "https://example.org/"}}
# Run input.jsonl through the zyte-api command, discard its stderr output,
# extract the text of the browser HTML, and print its "headers" JSON field.
zyte-api input.jsonl 2>/dev/null \
  | jq --raw-output .browserHtml \
  | xmllint --html --xpath '//text()' - 2>/dev/null \
  | jq .headers
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/**
 * Requests browser HTML with a custom Referer request header, then parses the page text as JSON
 * and reads its "headers" object.
 */
class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> requestHeaders = ImmutableMap.of("referer", "https://example.org/");
    Map<String, Object> parameters =
        ImmutableMap.of(
            "url",
            "https://httpbin.org/anything",
            "browserHtml",
            true,
            "requestHeaders",
            requestHeaders);
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    // Pass the content type explicitly: the single-argument StringEntity
    // constructor defaults to text/plain with ISO-8859-1, which would not
    // match the UTF-8 JSON declared in the Content-Type header.
    request.setEntity(new StringEntity(requestBody, ContentType.APPLICATION_JSON));

    try (CloseableHttpClient client = HttpClients.createDefault();
        CloseableHttpResponse response = client.execute(request)) {
      HttpEntity entity = response.getEntity();
      String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
      JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
      String browserHtml = jsonObject.get("browserHtml").getAsString();
      Document document = Jsoup.parse(browserHtml);
      // The text of the rendered page is itself parsed as a JSON document.
      JsonObject data = JsonParser.parseString(document.text()).getAsJsonObject();
      JsonObject headers = data.get("headers").getAsJsonObject();
    }
  }

  /** Builds the HTTP Basic Authorization header: API key as user name, empty password. */
  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    // Use an explicit charset so the result does not depend on the platform default.
    String encodedAuth =
        Base64.getEncoder().encodeToString(auth.getBytes(StandardCharsets.UTF_8));
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')
const cheerio = require('cheerio')
// Request browser HTML with a custom referer request header, then parse the
// page text as JSON and read its headers property.
const requestBody = {
  url: 'https://httpbin.org/anything',
  browserHtml: true,
  requestHeaders: { referer: 'https://example.org/' }
}
const requestConfig = {
  // The API key is sent as the Basic-auth user name; no password is set.
  auth: { username: 'YOUR_API_KEY' }
}
axios
  .post('https://api.zyte.com/v1/extract', requestBody, requestConfig)
  .then((response) => {
    const $ = cheerio.load(response.data.browserHtml)
    const pageText = $.text()
    const data = JSON.parse(pageText)
    const headers = data.headers
  })
<?php
// Request browser HTML with a custom referer request header, then decode the
// page text as JSON and read its headers property.
$payload = [
    'url' => 'https://httpbin.org/anything',
    'browserHtml' => true,
    'requestHeaders' => [
        'referer' => 'https://example.org/',
    ],
];
$options = [
    // API key as the Basic-auth user name, with an empty password.
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => $payload,
];
$client = new GuzzleHttp\Client();
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', $options);
$api = json_decode($response->getBody());
$doc = new DOMDocument();
$doc->loadHTML($api->browserHtml);
// The text content of the rendered document is itself decoded as JSON.
$data = json_decode($doc->textContent);
$headers = $data->headers;
import json
import requests
from parsel import Selector
# Request browser HTML with a custom referer request header. The API key is
# the HTTP Basic auth user name and the password is empty.
request_body = {
    'url': 'https://httpbin.org/anything',
    'browserHtml': True,
    'requestHeaders': {
        'referer': 'https://example.org/',
    },
}
api_response = requests.post(
    'https://api.zyte.com/v1/extract',
    auth=('YOUR_API_KEY', ''),
    json=request_body,
)
browser_html = api_response.json()['browserHtml']
# The first text node of the page is parsed as JSON to read its headers.
response_json = Selector(browser_html).xpath('//text()').get()
response_data = json.loads(response_json)
headers = response_data['headers']
import asyncio
import json
from parsel import Selector
from zyte_api.aio.client import AsyncClient
async def main():
    """Fetch browser HTML with a custom referer header and read the headers."""
    query = {
        'url': 'https://httpbin.org/anything',
        'browserHtml': True,
        'requestHeaders': {
            'referer': 'https://example.org/',
        },
    }
    client = AsyncClient()
    api_response = await client.request_raw(query)
    browser_html = api_response['browserHtml']
    # The first text node of the page is parsed as JSON to read its headers.
    response_json = Selector(browser_html).xpath('//text()').get()
    response_data = json.loads(response_json)
    headers = response_data['headers']


asyncio.run(main())
import json
from scrapy import Request, Spider
class HTTPBinOrgSpider(Spider):
    """Spider that requests browser HTML while sending a Referer header."""

    name = "httpbin_org"

    def start_requests(self):
        # The Referer header is set on the request itself; browser HTML is
        # requested through the zyte_api_automap request meta key.
        zyte_api_params = {"browserHtml": True}
        yield Request(
            "https://httpbin.org/anything",
            headers={"Referer": "https://example.org/"},
            meta={"zyte_api_automap": zyte_api_params},
        )

    def parse(self, response):
        # The first text node of the page is parsed as JSON to read its headers.
        first_text = response.xpath("//text()").get()
        response_data = json.loads(first_text)
        headers = response_data["headers"]
At the moment, only the Referer header can be overridden this way. If you need to override additional headers, extract a response body instead, using its request header definition property (customHttpRequestHeaders).
JavaScript#
When using browser automation, JavaScript execution is enabled by default for most websites.
For some websites, however, JavaScript execution is disabled by default because it helps data extraction.
You can set the javascript key in your API request body to true or false to force enabling or disabling JavaScript execution, regardless of the default value for the target website.
Example
Note
Install and configure code example requirements to run the example below.
using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
using System.Xml.XPath;
using HtmlAgilityPack;
// Requests browser HTML with JavaScript execution disabled, then reads the
// text of the element whose id is "detected_value".
// Disposable objects are released with `using`, and a non-success HTTP status
// raises an exception before the response body is parsed.
var handler = new HttpClientHandler
{
    AutomaticDecompression = DecompressionMethods.All
};
// Disposing the client also disposes the handler it was built with.
using var client = new HttpClient(handler);

// HTTP Basic auth: the API key is the user name and the password is empty.
var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);
client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

var input = new Dictionary<string, object>()
{
    {"url", "https://www.whatismybrowser.com/detect/is-javascript-enabled"},
    {"browserHtml", true},
    {"javascript", false}
};
var inputJson = JsonSerializer.Serialize(input);
using var content = new StringContent(inputJson, Encoding.UTF8, "application/json");

using HttpResponseMessage response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
response.EnsureSuccessStatusCode();
var body = await response.Content.ReadAsByteArrayAsync();
using var data = JsonDocument.Parse(body);
// GetString() returns the JSON string value and throws if the value is not a string.
var browserHtml = data.RootElement.GetProperty("browserHtml").GetString();

var htmlDocument = new HtmlDocument();
htmlDocument.LoadHtml(browserHtml);
var navigator = htmlDocument.CreateNavigator();
var nodeIterator = (XPathNodeIterator)navigator.Evaluate("//*[@id=\"detected_value\"]/text()");
// Fail with a clear message when the expected element is missing.
if (!nodeIterator.MoveNext())
{
    throw new System.InvalidOperationException("No text found for the element with id \"detected_value\".");
}
var isJavaScriptEnabled = nodeIterator.Current.ToString();
{
"url": "https://www.whatismybrowser.com/detect/is-javascript-enabled",
"browserHtml": true,
"javascript": false
}
# POST input.json to Zyte API, extract the browser HTML from the JSON
# response, and print the text of the element with id "detected_value".
curl \
    --compressed \
    --user YOUR_API_KEY: \
    --header 'Content-Type: application/json' \
    --data @input.json \
    https://api.zyte.com/v1/extract \
  | jq --raw-output .browserHtml \
  | xmllint --html --xpath '//*[@id="detected_value"]/text()' - 2>/dev/null
{"url": "https://www.whatismybrowser.com/detect/is-javascript-enabled", "browserHtml": true, "javascript": false}
# Run input.jsonl through the zyte-api command, discard its stderr output,
# and print the text of the element with id "detected_value".
zyte-api input.jsonl 2>/dev/null \
  | jq --raw-output .browserHtml \
  | xmllint --html --xpath '//*[@id="detected_value"]/text()' - 2>/dev/null
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/**
 * Example: request browser HTML through Zyte API with the {@code javascript} parameter set to
 * {@code false}, then read the text of the element whose id is {@code detected_value}.
 */
class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  /**
   * Sends the extract request and parses the returned browser HTML.
   *
   * @param args command-line arguments (unused)
   * @throws InterruptedException declared for backward compatibility with existing callers
   * @throws IOException if the HTTP exchange fails
   * @throws ParseException if the response entity cannot be converted to a string
   */
  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> parameters =
        ImmutableMap.of(
            "url",
            "https://www.whatismybrowser.com/detect/is-javascript-enabled",
            "browserHtml",
            true,
            "javascript",
            false);
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    // Pass an explicit JSON content type: the single-argument StringEntity constructor
    // falls back to a text/plain ISO-8859-1 entity, which would mis-encode any non-ASCII
    // character in the JSON body.
    request.setEntity(new StringEntity(requestBody, ContentType.APPLICATION_JSON));

    try (CloseableHttpClient client = HttpClients.createDefault()) {
      try (CloseableHttpResponse response = client.execute(request)) {
        HttpEntity entity = response.getEntity();
        String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
        JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
        String browserHtml = jsonObject.get("browserHtml").getAsString();
        Document document = Jsoup.parse(browserHtml);
        // Text of the element with id "detected_value" in the rendered page.
        String isJavaScriptEnabled = document.select("#detected_value").text();
      }
    }
  }

  /**
   * Builds the HTTP Basic Authorization header value from the API key and an empty password.
   *
   * @return the header value, in the form {@code "Basic <base64>"}
   */
  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    // Encode with an explicit charset so the result does not depend on the platform default.
    String encodedAuth =
        Base64.getEncoder().encodeToString(auth.getBytes(StandardCharsets.UTF_8));
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')
const cheerio = require('cheerio')
// Parameters sent to Zyte API: the target URL, browserHtml set to true,
// and javascript set to false.
const requestBody = {
  url: 'https://www.whatismybrowser.com/detect/is-javascript-enabled',
  browserHtml: true,
  javascript: false
}

// The API key is sent as the HTTP Basic auth username.
const requestConfig = { auth: { username: 'YOUR_API_KEY' } }

// Load the returned browser HTML and read the text of the element whose id
// is "detected_value".
const readDetectedValue = ({ data }) => {
  const $ = cheerio.load(data.browserHtml)
  const isJavaScriptEnabled = $('#detected_value').text()
}

axios
  .post('https://api.zyte.com/v1/extract', requestBody, requestConfig)
  .then(readDetectedValue)
<?php
// Request browser HTML from Zyte API with the "javascript" parameter set to
// false, then read the text of the element whose id is "detected_value".
$client = new GuzzleHttp\Client();
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', [
    // The API key is the Basic auth username; the password is empty.
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => [
        'url' => 'https://www.whatismybrowser.com/detect/is-javascript-enabled',
        'browserHtml' => true,
        'javascript' => false,
    ],
]);
$api = json_decode($response->getBody());
$doc = new DOMDocument();
// Real-world HTML routinely makes libxml report parse errors and warnings
// (for example on HTML5 elements). Silence them for this call only, which
// mirrors how the command-line examples discard xmllint's diagnostics.
$doc->loadHTML($api->browserHtml, LIBXML_NOERROR | LIBXML_NOWARNING);
$xpath = new DOMXPath($doc);
$is_javascript_enabled = $xpath->query("//*[@id='detected_value']")->item(0)->textContent;
import requests
from parsel import Selector
# Parameters sent to Zyte API: the target URL, browserHtml set to True,
# and javascript set to False.
zyte_api_params = {
    'url': 'https://www.whatismybrowser.com/detect/is-javascript-enabled',
    'browserHtml': True,
    'javascript': False,
}

# The API key is the Basic auth username; the password is empty.
api_response = requests.post(
    'https://api.zyte.com/v1/extract',
    auth=('YOUR_API_KEY', ''),
    json=zyte_api_params,
)

# Read the text of the element whose id is "detected_value" from the
# browser HTML returned in the JSON response.
browser_html = api_response.json()['browserHtml']
selector = Selector(text=browser_html)
is_javascript_enabled: str = selector.css('#detected_value::text').get()
import asyncio
from parsel import Selector
from zyte_api.aio.client import AsyncClient
async def main():
    """Request browser HTML with ``javascript`` set to False and read ``#detected_value``."""
    zyte_client = AsyncClient()
    query = {
        'url': 'https://www.whatismybrowser.com/detect/is-javascript-enabled',
        'browserHtml': True,
        'javascript': False,
    }
    result = await zyte_client.request_raw(query)
    # Parse the returned browser HTML and take the text of the element whose
    # id is "detected_value".
    html_selector = Selector(text=result['browserHtml'])
    is_javascript_enabled: str = html_selector.css('#detected_value::text').get()


asyncio.run(main())
from scrapy import Request, Spider
class HTTPBinOrgSpider(Spider):
    """Spider that requests browser HTML with ``javascript`` set to False."""

    name = "httpbin_org"

    def start_requests(self):
        """Yield the single start request, carrying the Zyte API parameters in its meta."""
        zyte_api_params = {
            "browserHtml": True,
            "javascript": False,
        }
        yield Request(
            url="https://www.whatismybrowser.com/detect/is-javascript-enabled",
            meta={"zyte_api_automap": zyte_api_params},
        )

    def parse(self, response):
        """Read the text of the element whose id is ``detected_value``."""
        detected_value = response.css("#detected_value::text")
        is_javascript_enabled: str = detected_value.get()