Using Smart Proxy Manager with Java

Warning

zyte-smartproxy-ca.crt must be installed in your OS for the code below to work. You can follow these instructions to install it.

Note

Because of HTTPCLIENT-1649 you should use version 4.5 or later of HttpComponents Client.

Note

All the code in this documentation has been tested with Java 17, HttpComponents 4.5 and Jsoup 1.14.3.

Using Apache HttpComponents

This is how the Apache HttpComponents example looks with Smart Proxy Manager support:

import java.io.File;
import javax.net.ssl.SSLContext;
import org.apache.http.HttpHeaders;
import org.apache.http.HttpHost;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.AuthCache;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.TrustSelfSignedStrategy;
import org.apache.http.impl.auth.BasicScheme;
import org.apache.http.impl.client.BasicAuthCache;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicHeader;
import org.apache.http.ssl.SSLContexts;
import org.apache.http.util.EntityUtils;

    /**
     * Sample client that requests the root page of https://toscrape.com through
     * the Smart Proxy Manager endpoint using Apache HttpComponents, then prints
     * the response status line and body to standard output.
     */
    public class ClientProxyAuthentication {

        public static void main(String[] args) throws Exception {

            final String proxyHost = "proxy.zyte.com";
            final int proxyPort = 8011;

            HttpHost proxy = new HttpHost(proxyHost, proxyPort);
            HttpHost target = new HttpHost("toscrape.com", 443, "https");

            // Proxy credentials: the API key is the user name, the password is empty.
            CredentialsProvider proxyCredentials = new BasicCredentialsProvider();
            proxyCredentials.setCredentials(
                    new AuthScope(proxyHost, proxyPort),
                    new UsernamePasswordCredentials("<API KEY>", ""));

            // Trust the CAs found in the given keystore plus any self-signed certificate.
            File trustStore = new File("/path/to/jre/lib/security/cacerts");
            char[] trustStorePassword = "changeit".toCharArray();
            SSLContext tlsContext = SSLContexts.custom()
                    .loadTrustMaterial(trustStore, trustStorePassword, new TrustSelfSignedStrategy())
                    .build();

            // Only TLSv1.2 is enabled; no explicit cipher-suite list is supplied.
            String[] enabledProtocols = {"TLSv1.2"};
            SSLConnectionSocketFactory tlsSocketFactory = new SSLConnectionSocketFactory(
                    tlsContext,
                    enabledProtocols,
                    null,
                    SSLConnectionSocketFactory.getDefaultHostnameVerifier());

            // Register a Basic scheme for the proxy in the auth cache up front so
            // that credentials can be sent pre-emptively with the first request.
            BasicScheme proxyBasicScheme = new BasicScheme();
            proxyBasicScheme.processChallenge(
                    new BasicHeader(HttpHeaders.PROXY_AUTHENTICATE, "Basic realm=\"Crawlera\""));
            AuthCache preemptiveAuth = new BasicAuthCache();
            preemptiveAuth.put(proxy, proxyBasicScheme);

            HttpClientContext requestContext = HttpClientContext.create();
            requestContext.setAuthCache(preemptiveAuth);

            // The request path is relative; the target host is passed to execute().
            HttpGet request = new HttpGet("/");
            request.setConfig(RequestConfig.custom().setProxy(proxy).build());

            final String separator = "----------------------------------------";

            try (CloseableHttpClient client = HttpClients.custom()
                    .setDefaultCredentialsProvider(proxyCredentials)
                    .setSSLSocketFactory(tlsSocketFactory)
                    .build()) {

                System.out.println("Executing request " + request.getRequestLine() +
                    " to " + target + " via " + proxy);

                try (CloseableHttpResponse response =
                         client.execute(target, request, requestContext)) {
                    System.out.println(separator);
                    System.out.println(response.getStatusLine());
                    System.out.println(separator);
                    System.out.println(EntityUtils.toString(response.getEntity()));
                    EntityUtils.consume(response.getEntity());
                }
            }
        }
    }

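zyte-smartproxy-ca.crt should be added to the Java keystore, for instance with keytool (on Java 9 and later, which have no jre/ subdirectory, the default keystore is $JAVA_HOME/lib/security/cacerts):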

keytool -import -file /path/to/zyte-smartproxy-ca.crt -storepass changeit -keystore $JAVA_HOME/jre/lib/security/cacerts -alias crawleracert

Warning

Some HTTP client libraries, including Apache HttpComponents Client and .NET, don’t send authentication headers by default. This can result in doubled requests, so pre-emptive authentication should be enabled where this is the case. In the above example, we are making HTTPS requests to https://toscrape.com through Smart Proxy Manager. It is assumed that the Smart Proxy Manager certificate has been installed, since the CONNECT method will be employed.

Using Jsoup

Here’s an example of Smart Proxy Manager integration with Jsoup. In this example we read a list of URLs from the file urls.txt and crawl them in parallel.

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Base64;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/**
 * Sample crawler that fetches every URL listed in {@code urls.txt} through
 * Smart Proxy Manager using Jsoup and prints the text of each page's
 * {@code title} element(s).
 */
public class WebScraping {

    /**
     * Reads one URL per line from {@code ./urls.txt} and fetches them with a
     * parallel stream, printing {@code <thread name>: <title text>} for every
     * page that could be retrieved. Blank lines are skipped, and a failure on
     * one URL is reported on standard error without stopping the others.
     *
     * @param args unused
     * @throws Exception if {@code urls.txt} cannot be read
     */
    public static void main(String[] args) throws Exception {

        String authString = "<API KEY>:";
        // Encode with an explicit charset so the header value does not depend
        // on the platform default encoding.
        String encodedAuthString = Base64.getEncoder()
            .encodeToString(authString.getBytes(StandardCharsets.UTF_8));

        final List<String> urls = Files.readAllLines(Paths.get(".", "urls.txt"));

        urls.parallelStream()
            .map(String::trim)
            // An empty URL would make Jsoup.connect throw, so drop blank lines.
            .filter(url -> !url.isEmpty())
            .forEach(url -> fetchAndPrintTitle(url, encodedAuthString));
    }

    /**
     * Fetches a single URL through the proxy and prints its title text.
     *
     * @param url the page to fetch
     * @param encodedAuthString Base64-encoded {@code "<API KEY>:"} credentials
     *        sent in the {@code Proxy-Authorization} header
     */
    private static void fetchAndPrintTitle(String url, String encodedAuthString) {
        try {
            final Document doc = Jsoup.connect(url)
                .header("Proxy-Authorization", "Basic " + encodedAuthString)
                .followRedirects(true)
                .ignoreHttpErrors(true)
                .ignoreContentType(true)
                .timeout(180000)
                .proxy("proxy.zyte.com", 8011)
                .get();
            final String title = doc.select("title").text();
            System.out.println(Thread.currentThread().getName() + ": " + title);
        } catch (IOException | IllegalArgumentException e) {
            // IllegalArgumentException covers malformed URLs rejected by Jsoup;
            // catching it here keeps one bad line from aborting the whole crawl.
            System.err.println("Failed to fetch " + url);
            e.printStackTrace();
        }
    }
}