Integrations

Python Requests integration

Here is a code example that illustrates how to use Smart Proxy Manager with the Python Requests library:

import requests

url = "http://httpbin.org/ip"
proxy_host = "proxy.crawlera.com"
proxy_port = "8011"
proxy_auth = "<Smart Proxy Manager API KEY>:" # Make sure to include ':' at the end
proxies = {"https": "https://{}@{}:{}/".format(proxy_auth, proxy_host, proxy_port),
      "http": "http://{}@{}:{}/".format(proxy_auth, proxy_host, proxy_port)}

r = requests.get(url, proxies=proxies, verify=False)
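
Note that verify=False disables certificate verification and is only convenient for a quick test. For verified HTTPS you can instead point Requests at the Smart Proxy Manager CA certificate (zyte-proxy-ca.crt, referenced in the sections below). A minimal sketch, assuming the certificate has been downloaded to a local path:

import requests

url = "https://httpbin.org/ip"
proxy_auth = "<Smart Proxy Manager API KEY>:"
proxies = {
    "https": "https://{}@proxy.crawlera.com:8011/".format(proxy_auth),
    "http": "http://{}@proxy.crawlera.com:8011/".format(proxy_auth),
}

# Verify the proxy's certificate instead of disabling TLS checks.
r = requests.get(url, proxies=proxies, verify="/path/to/zyte-proxy-ca.crt")
print(r.text)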

Note

This code was tested with Requests version 2.18. Earlier versions can lead to 407 authentication errors, so make sure the installed Requests version is at least 2.18.
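
If in doubt about the installed version, a small runtime guard can enforce this requirement. A minimal sketch that parses the standard requests.__version__ attribute:

import requests

# Fail fast on Requests versions known to mishandle proxy authentication.
major, minor = (int(part) for part in requests.__version__.split(".")[:2])
assert (major, minor) >= (2, 18), "Requests >= 2.18 is required"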

Scrapy integration

The recommended way to use Smart Proxy Manager with Scrapy is through the Zyte proxy middleware, which can be installed with:

pip install scrapy-crawlera

You can enable the middleware by adding the following settings to your Scrapy project:

DOWNLOADER_MIDDLEWARES = {'scrapy_crawlera.CrawleraMiddleware': 610}
CRAWLERA_ENABLED = True
CRAWLERA_APIKEY = '<API key>'

See the scrapy-crawlera documentation for more information.
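
Once the middleware is enabled, spiders need no proxy-specific code: every request is routed through Smart Proxy Manager automatically. A minimal sketch of such a spider (the spider name and target URL are illustrative; custom_settings is Scrapy's standard mechanism for scoping settings to a single spider):

import scrapy


class SampleSpider(scrapy.Spider):
    name = "sample"
    start_urls = ["http://httpbin.org/ip"]

    # The same settings as above, scoped to this spider only.
    custom_settings = {
        "DOWNLOADER_MIDDLEWARES": {"scrapy_crawlera.CrawleraMiddleware": 610},
        "CRAWLERA_ENABLED": True,
        "CRAWLERA_APIKEY": "<API key>",
    }

    def parse(self, response):
        # The response body arrives through Smart Proxy Manager transparently.
        self.logger.info(response.text)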

Scrapy Cloud integration

To use Smart Proxy Manager in Scrapy Cloud projects, enable the Crawlera addon. Go to Settings > Addons > Crawlera to activate it.

Settings

CRAWLERA_URL

Proxy URL (default: http://proxy.crawlera.com:8011)

CRAWLERA_ENABLED

Tick the checkbox to enable Smart Proxy Manager

CRAWLERA_APIKEY

Smart Proxy Manager API key

CRAWLERA_MAXBANS

Number of bans to ignore before closing the spider (default: 20)

CRAWLERA_DOWNLOAD_TIMEOUT

Timeout for requests in seconds (default: 190)
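
These addon settings mirror the scrapy-crawlera settings of the same names, so a project running outside Scrapy Cloud can reproduce the same configuration in its settings.py. A sketch with the defaults listed above spelled out:

# settings.py -- local equivalent of the Crawlera addon configuration
DOWNLOADER_MIDDLEWARES = {'scrapy_crawlera.CrawleraMiddleware': 610}
CRAWLERA_ENABLED = True
CRAWLERA_APIKEY = '<API key>'
CRAWLERA_URL = 'http://proxy.crawlera.com:8011'  # default
CRAWLERA_MAXBANS = 20  # default
CRAWLERA_DOWNLOAD_TIMEOUT = 190  # default, in seconds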

Java integration

Note

Because of HTTPCLIENT-1649, you should use version 4.5 or later of HttpComponents Client.

This is how the Apache HttpComponents example looks with Smart Proxy Manager support:

import java.io.File;
import javax.net.ssl.SSLContext;
import org.apache.http.HttpHeaders;
import org.apache.http.HttpHost;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.AuthCache;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.TrustSelfSignedStrategy;
import org.apache.http.impl.auth.BasicScheme;
import org.apache.http.impl.client.BasicAuthCache;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicHeader;
import org.apache.http.ssl.SSLContexts;
import org.apache.http.util.EntityUtils;

public class ClientProxyAuthentication {

    public static void main(String[] args) throws Exception {

        // Trust own CA and all self-signed certs
        SSLContext sslcontext = SSLContexts.custom()
                .loadTrustMaterial(new File("/path/to/jre/lib/security/cacerts"),
                                   "changeit".toCharArray(),
                                   new TrustSelfSignedStrategy())
                .build();

        // Allow TLSv1.2 protocol only
        SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(
                sslcontext, new String[] {"TLSv1.2"},
                null,
                SSLConnectionSocketFactory.getDefaultHostnameVerifier());

        CredentialsProvider credsProvider = new BasicCredentialsProvider();
        credsProvider.setCredentials(
                new AuthScope("proxy.crawlera.com", 8011),
                new UsernamePasswordCredentials("<API KEY>", ""));

        try (CloseableHttpClient httpclient = HttpClients.custom()
                .setDefaultCredentialsProvider(credsProvider)
                .setSSLSocketFactory(sslsf)
                .build())
        {
            HttpHost target = new HttpHost("twitter.com", 443, "https");
            HttpHost proxy = new HttpHost("proxy.crawlera.com", 8011);

            AuthCache authCache = new BasicAuthCache();

            BasicScheme basicAuth = new BasicScheme();
            basicAuth.processChallenge(
                    new BasicHeader(HttpHeaders.PROXY_AUTHENTICATE,
                                    "Basic realm=\"Crawlera\""));
            authCache.put(proxy, basicAuth);

            HttpClientContext ctx = HttpClientContext.create();
            ctx.setAuthCache(authCache);

            RequestConfig config = RequestConfig.custom()
                .setProxy(proxy)
                .build();

            HttpGet httpget = new HttpGet("/");
            httpget.setConfig(config);

            System.out.println("Executing request " + httpget.getRequestLine() +
                " to " + target + " via " + proxy);

            try (CloseableHttpResponse response = httpclient.execute(
                target, httpget, ctx))
            {
                System.out.println("----------------------------------------");
                System.out.println(response.getStatusLine());
                System.out.println("----------------------------------------");
                System.out.println(EntityUtils.toString(response.getEntity()));
                EntityUtils.consume(response.getEntity());
            }
        }
    }
}

The zyte-proxy-ca.crt certificate should be added to your keystore, for instance with keytool:

keytool -import -file /path/to/zyte-proxy-ca.crt -storepass changeit -keystore $JAVA_HOME/jre/lib/security/cacerts -alias crawleracert

Warning

Some HTTP client libraries, including Apache HttpComponents Client and .NET, don’t send authentication headers by default. This can result in doubled requests, so preemptive authentication should be enabled where this is the case. In the above example we are making HTTPS requests to https://twitter.com through Smart Proxy Manager. It is assumed that the Smart Proxy Manager certificate has been installed, since the CONNECT method will be employed.

Node.js integration

Making use of postman-request:

var request = require('postman-request');
var fs = require('fs');

var proxyRequest = request.defaults({
    'proxy': 'http://<API KEY>:@proxy.crawlera.com:8011'
});

var options = {
    url: 'https://twitter.com',
    ca: fs.readFileSync("/path/to/zyte-proxy-ca.crt"),
    requestCert: true,
    rejectUnauthorized: true
};

function callback(error, response, body) {
    if (!error && response.statusCode == 200) {
        console.log(response.headers);
        console.log(body);
    }
    else {
        console.log(error, response, body);
    }
}

proxyRequest(options, callback);

The Smart Proxy Manager CA certificate can be downloaded here: zyte-proxy-ca.crt.

C# integration

using System;
using System.IO;
using System.Net;

namespace ProxyRequest
{
    class MainClass
    {
        public static void Main(string[] args)
        {
            var myProxy = new WebProxy("http://proxy.crawlera.com:8011", true);
            myProxy.Credentials = new NetworkCredential("<CRAWLERA_APIKEY>", "");

            var request = (HttpWebRequest)WebRequest.Create("https://httpbin.scrapinghub.com/headers");
            request.Proxy = myProxy;
            request.PreAuthenticate = true;
            request.AllowAutoRedirect = false;
            request.ServerCertificateValidationCallback += (sender, certificate, chain, sslPolicyErrors) => true;

            var response = request.GetResponse();
            Console.WriteLine("Response Status: " + ((HttpWebResponse)response).StatusDescription);
            Console.WriteLine("\nResponse Headers:\n" + ((HttpWebResponse)response).Headers);
            var dataStream = response.GetResponseStream();
            var reader = new StreamReader(dataStream);
            string responseFromServer = reader.ReadToEnd();
            Console.WriteLine("Response Body:\n" + responseFromServer);
            reader.Close();
            response.Close();
        }
    }
}

Another approach, with the Proxy-Authorization header:

using System;
using System.IO;
using System.Net;

namespace ProxyRequest
{
    class MainClass
    {
        public static void Main (string[] args)
        {
            var myProxy = new WebProxy("http://proxy.crawlera.com:8011");
            string apiKey = "<CRAWLERA_APIKEY>:"; // Note the ":" sign at the end
            var encodedApiKey = Base64Encode(apiKey);

            var request = (HttpWebRequest)WebRequest.Create("https://httpbin.org/ip");
            request.Headers.Add("Proxy-Authorization", "Basic " + encodedApiKey);
            request.Proxy = myProxy;
            request.PreAuthenticate = true;
            request.AllowAutoRedirect = false;
            request.ServerCertificateValidationCallback += (sender, certificate, chain, sslPolicyErrors) => true;

            var response = request.GetResponse();
            Console.WriteLine("Response Status: " + ((HttpWebResponse)response).StatusDescription);
            Console.WriteLine("\nResponse Headers:\n" + ((HttpWebResponse)response).Headers);
            var dataStream = response.GetResponseStream();
            var reader = new StreamReader(dataStream);
            string responseFromServer = reader.ReadToEnd();
            Console.WriteLine("Response Body:\n" + responseFromServer);
            reader.Close();
            response.Close();
        }

        public static string Base64Encode(string apiKey)
        {
            var plainTextBytes = System.Text.Encoding.UTF8.GetBytes(apiKey);
            return System.Convert.ToBase64String(plainTextBytes);
        }
    }
}

Warning

Some HTTP client libraries, including Apache HttpComponents Client and .NET, don’t send authentication headers by default. This can result in doubled requests, so preemptive authentication should be enabled where this is the case.

In the above example we are making an HTTPS request through Smart Proxy Manager. It is assumed that the Zyte proxy SSL Certificate has been installed, since the CONNECT method will be employed.

If you use WebClient and receive 407s from Smart Proxy Manager, try setting AllowAutoRedirect to false.

Ruby integration

Using Curb

Making use of curb, a Ruby binding for libcurl:

require 'curb'

url = "https://twitter.com"
proxy = "proxy.crawlera.com:8011"
proxy_auth = "<CRAWLERA_APIKEY>:"

c = Curl::Easy.new(url) do |curl|
  curl.proxypwd = proxy_auth
  curl.proxy_url = proxy
  curl.verbose = true
end

c.perform
puts c.body_str

Using Typhoeus

Making use of typhoeus, another Ruby binding for libcurl:

require 'typhoeus'

url = "https://twitter.com"
proxy_host = "proxy.crawlera.com:8011"
proxy_auth = "<CRAWLERA_APIKEY>:"

request = Typhoeus::Request.new(
  url,
  method: :get,
  proxy: proxy_host,
  proxyuserpwd: proxy_auth,
  headers: {"X-Crawlera-Timeout" => "60000"}
)

request.run
print "Response Code: "
puts request.response.code
print "Response Time: "
puts request.response.total_time
print "Response Headers: "
puts request.response.headers
print "Response Body: "
puts request.response.body

Using Mechanize

Making use of mechanize, a Ruby library for automated web interaction. Don’t forget to load the certificate file zyte-proxy-ca.crt and point to it with the SSL_CERT_FILE environment variable: export SSL_CERT_FILE=/path/to/zyte-proxy-ca.crt

require 'rubygems'
require 'mechanize'

url = "https://twitter.com"
proxy_host = "proxy.crawlera.com"
proxy_api_key = "<CRAWLERA_APIKEY>"

agent = Mechanize.new
agent.set_proxy proxy_host, 8011, proxy_api_key, ''

res = agent.get(url)
puts res.body

PHP integration

Using Curl

Making use of the PHP binding for libcurl:

<?php

$ch = curl_init();

$url = 'https://httpbin.scrapinghub.com/get';
$proxy = 'proxy.crawlera.com:8011';
$proxy_auth = '<API KEY>:';

curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_PROXY, $proxy);
curl_setopt($ch, CURLOPT_PROXYUSERPWD, $proxy_auth);
curl_setopt($ch, CURLOPT_HEADER, 1);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 30);
curl_setopt($ch, CURLOPT_TIMEOUT, 180);
curl_setopt($ch, CURLOPT_CAINFO, '/path/to/zyte-proxy-ca.crt'); //required for HTTPS
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 1); //required for HTTPS

$scraped_page = curl_exec($ch);

if($scraped_page === false)
{
    echo 'cURL error: ' . curl_error($ch);
}
else
{
    echo $scraped_page;
}

curl_close($ch);

?>

Please be sure to download the certificate provided in your Smart Proxy Manager account and set the correct path to the file in your script.

Refer to the curl_multi_exec function to take advantage of Smart Proxy Manager’s concurrency feature and process requests in parallel (within the limits set for a given Smart Proxy Manager plan).

Using Guzzle

A Guzzle example:

<?php

use GuzzleHttp\Client as GuzzleClient;

$proxy_host = 'proxy.crawlera.com';
$proxy_port = '8011';
$proxy_user = '<API KEY>';
$proxy_pass = '';
$proxy_url = "http://{$proxy_user}:{$proxy_pass}@{$proxy_host}:{$proxy_port}";

$url = 'https://httpbin.org/headers';

$guzzle_client = new GuzzleClient();
$res = $guzzle_client->request('GET', $url, [
    'proxy' => $proxy_url,
    'headers' => [
        'X-Crawlera-Cookies' => 'disable',
        'Accept-Encoding' => 'gzip, deflate, br',
    ]
]);

echo $res->getBody();

?>