Zyte API automatic extraction#

To let Zyte API parse the contents of the target URL automatically and give you structured data, enable one of the automatic extraction request fields. Their schemas are covered in the Zyte API reference. Each schema is a subset of the latest version of its matching Zyte Data schema.

Example

Note

Install and configure code example requirements to run the example below.

using System;
using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;

HttpClientHandler handler = new HttpClientHandler()
{
    AutomaticDecompression = DecompressionMethods.All
};
HttpClient client = new HttpClient(handler);

var apiKey = "YOUR_API_KEY";
var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(apiKey + ":");
var auth = System.Convert.ToBase64String(bytes);
client.DefaultRequestHeaders.Add("Authorization", "Basic " + auth);

client.DefaultRequestHeaders.Add("Accept-Encoding", "br, gzip, deflate");

var input = new Dictionary<string, object>(){
    {"url", "https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html"},
    {"product", true}
};
var inputJson = JsonSerializer.Serialize(input);
var content = new StringContent(inputJson, Encoding.UTF8, "application/json");

HttpResponseMessage response = await client.PostAsync("https://api.zyte.com/v1/extract", content);
var body = await response.Content.ReadAsByteArrayAsync();

var data = JsonDocument.Parse(body);
var product = data.RootElement.GetProperty("product").ToString();

Console.WriteLine(product);
input.jsonl#
{"url": "https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html", "product": true}
zyte-api input.jsonl \
    | jq --raw-output .product
input.json#
{
    "url": "https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html",
    "product": true
}
curl \
    --user YOUR_API_KEY: \
    --header 'Content-Type: application/json' \
    --data @input.json \
    --compressed \
    https://api.zyte.com/v1/extract \
    | jq --raw-output .product
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Map;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.entity.StringEntity;

class Example {
  private static final String API_KEY = "YOUR_API_KEY";

  public static void main(final String[] args)
      throws InterruptedException, IOException, ParseException {
    Map<String, Object> parameters =
        ImmutableMap.of(
            "url",
            "https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html",
            "product",
            true);
    String requestBody = new Gson().toJson(parameters);

    HttpPost request = new HttpPost("https://api.zyte.com/v1/extract");
    request.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON);
    request.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
    request.setHeader(HttpHeaders.AUTHORIZATION, buildAuthHeader());
    request.setEntity(new StringEntity(requestBody));

    try (CloseableHttpClient client = HttpClients.createDefault()) {
      try (CloseableHttpResponse response = client.execute(request)) {
        HttpEntity entity = response.getEntity();
        String apiResponse = EntityUtils.toString(entity, StandardCharsets.UTF_8);
        JsonObject jsonObject = JsonParser.parseString(apiResponse).getAsJsonObject();
        String product = jsonObject.get("product").toString();
        System.out.println(product);
      }
    }
  }

  private static String buildAuthHeader() {
    String auth = API_KEY + ":";
    String encodedAuth = Base64.getEncoder().encodeToString(auth.getBytes());
    return "Basic " + encodedAuth;
  }
}
const axios = require('axios')

axios.post(
  'https://api.zyte.com/v1/extract',
  {
    url: 'https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html',
    product: true
  },
  {
    auth: { username: 'YOUR_API_KEY' }
  }
).then((response) => {
  const product = response.data.product
  console.log(product)
})
<?php

$client = new GuzzleHttp\Client();
$response = $client->request('POST', 'https://api.zyte.com/v1/extract', [
    'auth' => ['YOUR_API_KEY', ''],
    'headers' => ['Accept-Encoding' => 'gzip'],
    'json' => [
        'url' => 'https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html',
        'product' => true,
    ],
]);
$data = json_decode($response->getBody());
$product = json_encode($data->product);
echo $product.PHP_EOL;
import requests

api_response = requests.post(
    "https://api.zyte.com/v1/extract",
    auth=("YOUR_API_KEY", ""),
    json={
        "url": (
            "https://books.toscrape.com/catalogue"
            "/a-light-in-the-attic_1000/index.html"
        ),
        "product": True,
    },
)
product = api_response.json()["product"]
print(product)
import asyncio

from zyte_api.aio.client import AsyncClient


async def main():
    client = AsyncClient()
    api_response = await client.request_raw(
        {
            "url": (
                "https://books.toscrape.com/catalogue"
                "/a-light-in-the-attic_1000/index.html"
            ),
            "product": True,
        }
    )
    product = api_response["product"]
    print(product)


asyncio.run(main())
from scrapy import Request, Spider


class BooksToScrapeComSpider(Spider):
    name = "books_toscrape_com"

    def start_requests(self):
        yield Request(
            (
                "https://books.toscrape.com/catalogue"
                "/a-light-in-the-attic_1000/index.html"
            ),
            meta={
                "zyte_api_automap": {
                    "product": True,
                },
            },
        )

    def parse(self, response):
        product = response.raw_api_response["product"]
        print(product)

Output (first 5 lines):

{
  "name": "A Light in the Attic",
  "price": "51.77",
  "currency": "GBP",
  "currencyRaw": "£",

Automatic extraction supports geolocation, cookies, redirection, response headers, and metadata, plus additional features depending on your extraction source.

You can enable only one automatic extraction field per Zyte API request.
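
For example, here is a minimal sketch based on the Python requests example above; it combines product extraction with geolocation and response headers, with only one automatic extraction field ("product") enabled. The "US" geolocation value is an arbitrary choice for illustration.

import requests

# Sketch: product extraction combined with geolocation and response headers.
# Only the "product" automatic extraction field is enabled; "geolocation" and
# "httpResponseHeaders" are regular request fields. "US" is an example value.
api_response = requests.post(
    "https://api.zyte.com/v1/extract",
    auth=("YOUR_API_KEY", ""),
    json={
        "url": (
            "https://books.toscrape.com/catalogue"
            "/a-light-in-the-attic_1000/index.html"
        ),
        "product": True,
        "geolocation": "US",
        "httpResponseHeaders": True,
    },
)
data = api_response.json()
print(data["product"])
print(data.get("httpResponseHeaders"))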

Extraction source#

Automatic extraction can be performed using either a browser request or an HTTP request. Choose between them with the corresponding extractFrom option, e.g. productOptions.extractFrom when extracting a product.

Note

Automatic extraction from an HTTP request is currently in Beta. API usage is fully functional, but the feature is not yet available in the Zyte dashboard.

Currently, automatic extraction defaults to using a browser request. In the future, however, the default value may depend on the target website.
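
For example, here is a minimal sketch based on the Python requests example above that explicitly requests extraction from an HTTP request rather than relying on the default; using "browserHtml" as the extractFrom value would pin browser-based extraction instead.

import requests

# Sketch: choose the extraction source explicitly instead of relying on the
# default. "httpResponseBody" selects extraction from an HTTP request;
# "browserHtml" would select extraction from a browser request.
api_response = requests.post(
    "https://api.zyte.com/v1/extract",
    auth=("YOUR_API_KEY", ""),
    json={
        "url": (
            "https://books.toscrape.com/catalogue"
            "/a-light-in-the-attic_1000/index.html"
        ),
        "product": True,
        "productOptions": {"extractFrom": "httpResponseBody"},
    },
)
print(api_response.json()["product"])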

Automatic extraction using an HTTP request supports HTTP request attributes for method, body, and headers.
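
As a sketch of combining HTTP-based product extraction with a custom request header, building on the Python requests example above; the Accept-Language header and its value are arbitrary illustrations, not requirements.

import requests

# Sketch: HTTP-based extraction combined with a custom request header.
# The Accept-Language header below is an arbitrary example.
api_response = requests.post(
    "https://api.zyte.com/v1/extract",
    auth=("YOUR_API_KEY", ""),
    json={
        "url": (
            "https://books.toscrape.com/catalogue"
            "/a-light-in-the-attic_1000/index.html"
        ),
        "product": True,
        "productOptions": {"extractFrom": "httpResponseBody"},
        "customHttpRequestHeaders": [
            {"name": "Accept-Language", "value": "en"},
        ],
    },
)
print(api_response.json()["product"])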

Automatic extraction using a browser request supports browser HTML, screenshots, some request headers, actions, and toggling JavaScript. The limitations of browser requests also apply in this case.
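
For example, here is a minimal sketch of combining browser-based product extraction with browser HTML and a browser action; the scrollBottom action is an arbitrary illustration.

import requests

# Sketch: browser-based product extraction combined with browser HTML and a
# browser action. The scrollBottom action is an arbitrary example.
api_response = requests.post(
    "https://api.zyte.com/v1/extract",
    auth=("YOUR_API_KEY", ""),
    json={
        "url": (
            "https://books.toscrape.com/catalogue"
            "/a-light-in-the-attic_1000/index.html"
        ),
        "product": True,
        "productOptions": {"extractFrom": "browserHtml"},
        "browserHtml": True,
        "actions": [
            {"action": "scrollBottom"},
        ],
    },
)
data = api_response.json()
print(data["product"])
print(len(data["browserHtml"]))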

When deciding whether to use automatic extraction from a browser request or from an HTTP request, consider the following:

  • Extraction from an HTTP request is typically much faster and cheaper than extraction from a browser request.

  • For some websites, extraction from an HTTP request produces extremely poor results, such as a low extraction probability and missing fields; this often happens when JavaScript execution is required.

  • It is helpful to test both methods and choose extraction from a browser request if it provides better quality; the sketch after this list shows one way to compare them.
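
The following sketch, based on the Python requests example above, requests the same product with both extraction sources and compares the probability metadata of each result, which is one signal of extraction quality; field coverage is another.

import requests

# Sketch: extract the same product with both extraction sources and compare
# the probability metadata as a rough quality signal.
def extract_product(extract_from):
    response = requests.post(
        "https://api.zyte.com/v1/extract",
        auth=("YOUR_API_KEY", ""),
        json={
            "url": (
                "https://books.toscrape.com/catalogue"
                "/a-light-in-the-attic_1000/index.html"
            ),
            "product": True,
            "productOptions": {"extractFrom": extract_from},
        },
    )
    return response.json()["product"]

for extract_from in ("httpResponseBody", "browserHtml"):
    product = extract_product(extract_from)
    print(extract_from, product.get("metadata", {}).get("probability"))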

Request fields#

Enable any of the following request fields to get matching structured data: