Warning
Zyte API is replacing Smart Proxy Manager. It is no longer possible to sign up to Smart Proxy Manager. If you are an existing Smart Proxy Manager user, see Migrating from Smart Proxy Manager to Zyte API.
Using Smart Proxy Manager with Java#
Note
Because of HTTPCLIENT-1649 you should use version 4.5 or later of HttpComponents Client.
Note
All the code in this documentation has been tested with Java 17, HttpComponents 4.5 and Jsoup 1.14.3.
Using Apache HttpComponents#
This is how the Apache HttpComponents example looks with Smart Proxy Manager support:
import java.io.File;
import javax.net.ssl.SSLContext;
import org.apache.http.HttpHeaders;
import org.apache.http.HttpHost;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.AuthCache;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.TrustSelfSignedStrategy;
import org.apache.http.impl.auth.BasicScheme;
import org.apache.http.impl.client.BasicAuthCache;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicHeader;
import org.apache.http.ssl.SSLContexts;
import org.apache.http.util.EntityUtils;
/**
 * Example: fetches {@code https://toscrape.com/} through Smart Proxy Manager
 * using Apache HttpComponents Client, authenticating to the proxy with an API
 * key and printing the response status line and body to standard output.
 */
public class ClientProxyAuthentication {

    /** Proxy endpoint; shared by the credentials scope and the request configuration. */
    private static final String PROXY_HOST = "proxy.zyte.com";
    private static final int PROXY_PORT = 8011;

    /**
     * Runs the example request.
     *
     * @param args unused
     * @throws Exception if the trust store cannot be loaded or the request fails
     */
    public static void main(String[] args) throws Exception {
        // Trust own CA and all self-signed certs
        SSLContext sslcontext = SSLContexts.custom()
                .loadTrustMaterial(new File("/path/to/jre/lib/security/cacerts"),
                        "changeit".toCharArray(),
                        new TrustSelfSignedStrategy())
                .build();
        // Allow TLSv1.2 protocol only
        SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(
                sslcontext, new String[] {"TLSv1.2"},
                null,
                SSLConnectionSocketFactory.getDefaultHostnameVerifier());

        // The API key is the user name; the password is left empty.
        CredentialsProvider credsProvider = new BasicCredentialsProvider();
        credsProvider.setCredentials(
                new AuthScope(PROXY_HOST, PROXY_PORT),
                new UsernamePasswordCredentials("<API KEY>", ""));

        try (CloseableHttpClient httpclient = HttpClients.custom()
                .setDefaultCredentialsProvider(credsProvider)
                .setSSLSocketFactory(sslsf)
                .build()) {
            HttpHost target = new HttpHost("toscrape.com", 443, "https");
            HttpHost proxy = new HttpHost(PROXY_HOST, PROXY_PORT);

            // Pre-emptive proxy authentication: seed the auth cache with a Basic
            // scheme for the proxy so credentials are sent with the first request
            // instead of only after a challenge.
            AuthCache authCache = new BasicAuthCache();
            BasicScheme basicAuth = new BasicScheme();
            basicAuth.processChallenge(
                    new BasicHeader(HttpHeaders.PROXY_AUTHENTICATE,
                            "Basic realm=\"Crawlera\""));
            authCache.put(proxy, basicAuth);

            HttpClientContext ctx = HttpClientContext.create();
            ctx.setAuthCache(authCache);

            RequestConfig config = RequestConfig.custom()
                    .setProxy(proxy)
                    .build();

            HttpGet httpget = new HttpGet("/");
            httpget.setConfig(config);

            System.out.println("Executing request " + httpget.getRequestLine() +
                    " to " + target + " via " + proxy);

            try (CloseableHttpResponse response = httpclient.execute(
                    target, httpget, ctx)) {
                System.out.println("----------------------------------------");
                System.out.println(response.getStatusLine());
                System.out.println("----------------------------------------");
                // A response without a body has no entity; EntityUtils.toString
                // rejects null, so only print the body when one is present.
                // toString reads the entity to the end, so no separate
                // EntityUtils.consume call is needed afterwards.
                if (response.getEntity() != null) {
                    System.out.println(EntityUtils.toString(response.getEntity()));
                }
            }
        }
    }
}
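You must download the Zyte CA certificate and add it to the keystore, for instance with keytool (on Java 9 and later, which have no jre directory, the default trust store is $JAVA_HOME/lib/security/cacerts):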
keytool -import -file /path/to/zyte-ca.crt -storepass changeit -keystore $JAVA_HOME/jre/lib/security/cacerts -alias zytecert
Warning
Some HTTP client libraries, including Apache HttpComponents Client and .NET, don’t send authentication headers by default. This can result in doubled requests, so pre-emptive authentication should be enabled where this is the case. In the above example, we are making HTTPS requests to https://toscrape.com through Smart Proxy Manager. It is assumed that the Smart Proxy Manager certificate has been installed, since the CONNECT method will be employed. For more information, check out Fetching HTTPS pages with Zyte Smart Proxy Manager.
Using Jsoup#
Here’s an example of Smart Proxy Manager integration with Jsoup. In this
example we read a list of URLs from the file urls.txt
and crawl them in parallel.
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.io.IOException;
import java.util.Base64;
import java.util.List;
/**
 * Example: reads URLs from {@code ./urls.txt} (one per line), fetches each one
 * through Smart Proxy Manager with Jsoup on a parallel stream, and prints the
 * page title prefixed with the name of the thread that fetched it.
 */
public class WebScraping {

    private static final String PROXY_HOST = "proxy.zyte.com";
    private static final int PROXY_PORT = 8011;
    /** Per-request timeout passed to Jsoup, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 180000;

    /**
     * Runs the crawl.
     *
     * @param args unused
     * @throws Exception if {@code urls.txt} cannot be read
     */
    public static void main(String[] args) throws Exception {
        // Basic credentials: the API key as user name with an empty password.
        String authString = "<API KEY>:";
        // Encode with an explicit charset rather than the platform default.
        String encodedAuthString = Base64.getEncoder().encodeToString(
                authString.getBytes(java.nio.charset.StandardCharsets.UTF_8));

        final List<String> urls = Files.readAllLines(Paths.get(".", "urls.txt"));
        urls.parallelStream()
                .map(String::trim)
                // Blank lines would make Jsoup.connect throw and abort the whole stream.
                .filter(url -> !url.isEmpty())
                .forEach(url -> fetchAndPrintTitle(url, encodedAuthString));
    }

    /**
     * Fetches one URL through the proxy and prints its title; failures are
     * reported on standard error and do not stop the other fetches.
     */
    private static void fetchAndPrintTitle(String url, String encodedAuthString) {
        try {
            final Document doc = Jsoup.connect(url)
                    .header("Proxy-Authorization", "Basic " + encodedAuthString)
                    .followRedirects(true)
                    .ignoreHttpErrors(true)
                    .ignoreContentType(true)
                    .timeout(TIMEOUT_MILLIS)
                    .proxy(PROXY_HOST, PROXY_PORT)
                    .get();
            final String title = doc.select("title").text();
            System.out.println(Thread.currentThread().getName() + ": " + title);
        } catch (IOException | IllegalArgumentException e) {
            // IllegalArgumentException covers URLs Jsoup rejects as malformed.
            System.err.println("Failed to fetch " + url);
            e.printStackTrace();
        }
    }
}