PK s[AV_A43 3 phpunit.xml.distnu W+A
./Goutte/Tests
PK s[AVT) ) LICENSEnu W+A Copyright (c) 2010-2013 Fabien Potencier
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is furnished
to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
PK s[AV8迏) ) Goutte/Tests/ClientTest.phpnu W+A
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Goutte\Tests;
use Goutte\Client;
use GuzzleHttp\Client as GuzzleClient;
use GuzzleHttp\Exception\RequestException;
use GuzzleHttp\Message\Response as GuzzleResponse;
use GuzzleHttp\Stream\Stream;
use GuzzleHttp\Subscriber\History;
use GuzzleHttp\Subscriber\Mock;
use GuzzleHttp\Post\PostFile;
use Symfony\Component\BrowserKit\Cookie;
/**
 * Goutte Client Test
 *
 * Exercises Goutte\Client against a Guzzle client whose traffic is replaced by
 * queued mock responses and recorded by a history subscriber.
 *
 * @author Michael Dowling
 */
class ClientTest extends \PHPUnit_Framework_TestCase
{
    /** @var History Records the requests sent through the client built by getGuzzle() */
    protected $history;

    /** @var Mock Queue of canned responses (or exceptions) returned instead of real HTTP calls */
    protected $mock;

    /**
     * Builds a Guzzle client with fresh mock and history subscribers attached.
     *
     * One 200 response containing a single <p>Hi</p> paragraph is queued; tests
     * that need something else call $this->mock->clearQueue() first.
     *
     * @return GuzzleClient
     */
    protected function getGuzzle()
    {
        $this->history = new History();
        $this->mock = new Mock();
        // The body must contain a <p> element: several tests assert on filter('p')->text().
        $this->mock->addResponse(new GuzzleResponse(200, array(), Stream::factory('<html><body><p>Hi</p></body></html>')));

        $guzzle = new GuzzleClient(array('redirect.disable' => true, 'base_url' => ''));
        $guzzle->getEmitter()->attach($this->mock);
        $guzzle->getEmitter()->attach($this->history);

        return $guzzle;
    }

    public function testCreatesDefaultClient()
    {
        $client = new Client();
        $this->assertInstanceOf('GuzzleHttp\\ClientInterface', $client->getClient());
    }

    public function testUsesCustomClient()
    {
        $guzzle = new GuzzleClient();
        $client = new Client();
        // setClient() is fluent and getClient() hands back the very same instance.
        $this->assertSame($client, $client->setClient($guzzle));
        $this->assertSame($guzzle, $client->getClient());
    }

    public function testUsesCustomHeaders()
    {
        $guzzle = $this->getGuzzle();
        $client = new Client();
        $client->setClient($guzzle);
        $client->setHeader('X-Test', 'test');
        $client->request('GET', 'http://www.example.com/');
        $this->assertEquals('test', $this->history->getLastRequest()->getHeader('X-Test'));
    }

    public function testCustomUserAgent()
    {
        $guzzle = $this->getGuzzle();
        $client = new Client();
        $client->setClient($guzzle);
        $client->setHeader('User-Agent', 'foo');
        $client->request('GET', 'http://www.example.com/');
        $this->assertEquals('foo', $this->history->getLastRequest()->getHeader('User-Agent'));
    }

    public function testUsesAuth()
    {
        $guzzle = $this->getGuzzle();
        $client = new Client();
        $client->setClient($guzzle);
        $client->setAuth('me', '**');
        $client->request('GET', 'http://www.example.com/');
        $request = $this->history->getLastRequest();
        $this->assertEquals('me', $request->getConfig()->get('auth')[0]);
        $this->assertEquals('**', $request->getConfig()->get('auth')[1]);
        $this->assertEquals('basic', $request->getConfig()->get('auth')[2]);
    }

    public function testResetsAuth()
    {
        $guzzle = $this->getGuzzle();
        $client = new Client();
        $client->setClient($guzzle);
        $client->setAuth('me', '**');
        $client->resetAuth();
        $client->request('GET', 'http://www.example.com/');
        $request = $this->history->getLastRequest();
        $this->assertNull($request->getConfig()->get('auth')[0]);
        $this->assertNull($request->getConfig()->get('auth')[1]);
    }

    public function testUsesCookies()
    {
        $guzzle = $this->getGuzzle();
        $client = new Client();
        $client->setClient($guzzle);
        $client->getCookieJar()->set(new Cookie('test', '123'));
        $client->request('GET', 'http://www.example.com/');
        $request = $this->history->getLastRequest();
        $this->assertEquals('test=123', $request->getHeader('Cookie'));
    }

    public function testUsesPostFiles()
    {
        $guzzle = $this->getGuzzle();
        $client = new Client();
        $client->setClient($guzzle);
        $files = array(
            'test' => array(
                'name' => 'test.txt',
                'tmp_name' => __FILE__
            )
        );

        $client->request('POST', 'http://www.example.com/', array(), $files);
        $request = $this->history->getLastRequest();

        $files = $request->getBody()->getFiles();
        $this->assertFile(reset($files), 'test', __FILE__, array(
            'Content-Type' => 'text/x-php',
            'Content-Disposition' => 'form-data; filename="ClientTest.php"; name="test"',
        ));
    }

    public function testUsesPostNamedFiles()
    {
        $guzzle = $this->getGuzzle();
        $client = new Client();
        $client->setClient($guzzle);
        $files = array(
            'test' => __FILE__
        );

        $client->request('POST', 'http://www.example.com/', array(), $files);
        $request = $this->history->getLastRequest();

        $files = $request->getBody()->getFiles();
        $this->assertFile(reset($files), 'test', __FILE__, array(
            'Content-Type' => 'text/x-php',
            'Content-Disposition' => 'form-data; filename="ClientTest.php"; name="test"',
        ));
    }

    public function testUsesPostFilesNestedFields()
    {
        $guzzle = $this->getGuzzle();
        $client = new Client();
        $client->setClient($guzzle);
        $files = array(
            'form' => array(
                'test' => array(
                    'name' => 'test.txt',
                    'tmp_name' => __FILE__
                ),
            ),
        );

        $client->request('POST', 'http://www.example.com/', array(), $files);
        $request = $this->history->getLastRequest();

        // Nested arrays are flattened into bracketed field names.
        $files = $request->getBody()->getFiles();
        $this->assertFile(reset($files), 'form[test]', __FILE__, array(
            'Content-Type' => 'text/x-php',
            'Content-Disposition' => 'form-data; filename="ClientTest.php"; name="form[test]"',
        ));
    }

    public function testUsesPostFilesOnClientSide()
    {
        $guzzle = $this->getGuzzle();
        $client = new Client();
        $client->setClient($guzzle);
        $files = array(
            'test' => __FILE__,
        );

        $client->request('POST', 'http://www.example.com/', array(), $files);
        $request = $this->history->getLastRequest();

        $files = $request->getBody()->getFiles();
        $this->assertFile(reset($files), 'test', __FILE__, array(
            'Content-Type' => 'text/x-php',
            'Content-Disposition' => 'form-data; filename="ClientTest.php"; name="test"',
        ));
    }

    public function testUsesPostFilesUploadError()
    {
        $guzzle = $this->getGuzzle();
        $client = new Client();
        $client->setClient($guzzle);
        // Shape of an upload entry with an empty tmp_name and error code 4.
        $files = array(
            'test' => array(
                'name' => '',
                'type' => '',
                'tmp_name' => '',
                'error' => 4,
                'size' => 0,
            ),
        );

        $client->request('POST', 'http://www.example.com/', array(), $files);
        $request = $this->history->getLastRequest();

        // An entry with an empty tmp_name must not be attached to the request.
        $this->assertEquals(array(), $request->getBody()->getFiles());
    }

    public function testCreatesResponse()
    {
        $guzzle = $this->getGuzzle();
        $client = new Client();
        $client->setClient($guzzle);
        $crawler = $client->request('GET', 'http://www.example.com/');
        $this->assertEquals('Hi', $crawler->filter('p')->text());
    }

    public function testHandlesRedirectsCorrectly()
    {
        $guzzle = $this->getGuzzle();

        $this->mock->clearQueue();
        $this->mock->addResponse(new GuzzleResponse(301, array(
            'Location' => 'http://www.example.com/'
        )));
        $this->mock->addResponse(new GuzzleResponse(200, [], Stream::factory('<html><body><p>Test</p></body></html>')));

        $client = new Client();
        $client->setClient($guzzle);

        $crawler = $client->request('GET', 'http://www.example.com/');
        $this->assertEquals('Test', $crawler->filter('p')->text());

        // Ensure that two requests were sent
        $this->assertCount(2, $this->history);
    }

    public function testConvertsGuzzleHeadersToArrays()
    {
        $guzzle = $this->getGuzzle();

        $this->mock->clearQueue();
        $this->mock->addResponse(new GuzzleResponse(200, array(
            'Date' => 'Tue, 04 Jun 2013 13:22:41 GMT',
        )));

        $client = new Client();
        $client->setClient($guzzle);
        $client->request('GET', 'http://www.example.com/');
        $response = $client->getResponse();
        $headers = $response->getHeaders();

        $this->assertInternalType("array", array_shift($headers), "Header not converted from Guzzle\Http\Message\Header to array");
    }

    public function testNullResponseException()
    {
        $this->setExpectedException('GuzzleHttp\Exception\RequestException');
        $guzzle = $this->getGuzzle();
        $this->mock->clearQueue();
        // An exception built without a response: the client is expected to rethrow it.
        $exception = new RequestException('', $this->getMock('GuzzleHttp\Message\RequestInterface'));
        $this->mock->addException($exception);
        $client = new Client();
        $client->setClient($guzzle);
        $client->request('GET', 'http://www.example.com/');
    }

    /**
     * Asserts the field name, file name and headers of an uploaded file.
     *
     * @param PostFile $postFile  File taken from the recorded request body
     * @param string   $fieldName Expected form field name
     * @param string   $fileName  Expected file name
     * @param array    $headers   Expected headers
     */
    protected function assertFile(PostFile $postFile, $fieldName, $fileName, $headers)
    {
        // Expected value first, actual second, so failure messages read correctly.
        $this->assertEquals($fieldName, $postFile->getName());
        $this->assertEquals($fileName, $postFile->getFilename());
        $this->assertEquals($headers, $postFile->getHeaders());
    }

    public function testHttps()
    {
        $guzzle = $this->getGuzzle();

        $this->mock->clearQueue();
        $this->mock->addResponse(new GuzzleResponse(200, [], Stream::factory('<html><body><p>Test</p></body></html>')));
        $client = new Client();
        $client->setClient($guzzle);
        $crawler = $client->request('GET', 'https://www.example.com/');
        $this->assertEquals('https', $this->history->getLastRequest()->getScheme());
        $this->assertEquals('Test', $crawler->filter('p')->text());
    }

    public function testCustomUserAgentConstructor()
    {
        $guzzle = $this->getGuzzle();
        // Server parameters given to the constructor; HTTP_USER_AGENT must reach the request.
        $client = new Client([
            'HTTP_HOST' => '1.2.3.4',
            'HTTP_USER_AGENT' => 'SomeHost'
        ]);
        $client->setClient($guzzle);
        $client->request('GET', 'http://www.example.com/');
        $this->assertEquals('SomeHost', $this->history->getLastRequest()->getHeader('User-Agent'));
    }
}
PK s[AVUi* * Goutte/Resources/phar-stub.phpnu W+A
*
* This source file is subject to the MIT license that is bundled
* with this source code in the file LICENSE.
*/
// Load vendor/autoload.php from inside this very file (__FILE__), addressed
// through the phar:// stream wrapper.
require_once 'phar://'.__FILE__.'/vendor/autoload.php';
// Halt the compiler here: nothing after this statement is parsed as PHP.
__HALT_COMPILER();
PK s[AV'_ Goutte/Client.phpnu W+A
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Goutte;
use GuzzleHttp\Client as GuzzleClient;
use GuzzleHttp\ClientInterface as GuzzleClientInterface;
use GuzzleHttp\Exception\RequestException;
use GuzzleHttp\Message\RequestInterface;
use GuzzleHttp\Message\Response as GuzzleResponse;
use GuzzleHttp\Post\PostFile;
use Symfony\Component\BrowserKit\Client as BaseClient;
use Symfony\Component\BrowserKit\Response;
/**
 * Client.
 *
 * BrowserKit client that sends every request through a Guzzle client and
 * converts the Guzzle response back into a BrowserKit response.
 *
 * @author Fabien Potencier
 * @author Michael Dowling
 */
class Client extends BaseClient
{
    /** @var GuzzleClientInterface|null Guzzle client; created lazily by getClient() when unset */
    protected $client;

    /** @var array Headers applied to every outgoing request, keyed by header name */
    private $headers = array();

    /** @var array|null array($user, $password, $type), or null when no authentication is set */
    private $auth = null;

    /**
     * Sets the Guzzle client used to send requests.
     *
     * @param GuzzleClientInterface $client
     *
     * @return Client This instance, for chaining
     */
    public function setClient(GuzzleClientInterface $client)
    {
        $this->client = $client;

        return $this;
    }

    /**
     * Returns the Guzzle client, creating a default one on first use.
     *
     * The default client is configured with redirects disabled and cookies
     * enabled.
     *
     * @return GuzzleClientInterface
     */
    public function getClient()
    {
        if (!$this->client) {
            $this->client = new GuzzleClient(array('defaults' => array('allow_redirects' => false, 'cookies' => true)));
        }

        return $this->client;
    }

    /**
     * Registers a header that is set on every subsequent request.
     *
     * @param string $name
     * @param string $value
     *
     * @return Client This instance, for chaining
     */
    public function setHeader($name, $value)
    {
        $this->headers[$name] = $value;

        return $this;
    }

    /**
     * Removes a header previously registered with setHeader().
     *
     * @param string $name
     *
     * @return Client This instance, for chaining (consistent with the other mutators)
     */
    public function removeHeader($name)
    {
        unset($this->headers[$name]);

        return $this;
    }

    /**
     * Stores the credentials passed to Guzzle as the default "auth" option.
     *
     * @param string $user
     * @param string $password
     * @param string $type
     *
     * @return Client This instance, for chaining
     */
    public function setAuth($user, $password = '', $type = 'basic')
    {
        $this->auth = array($user, $password, $type);

        return $this;
    }

    /**
     * Clears the credentials stored by setAuth().
     *
     * @return Client This instance, for chaining
     */
    public function resetAuth()
    {
        $this->auth = null;

        return $this;
    }

    /**
     * Sends a BrowserKit request through Guzzle.
     *
     * @param object $request Request handed over by BrowserKit; its server
     *                        parameters, method, URI, content, parameters and
     *                        files are read
     *
     * @return Response
     *
     * @throws RequestException When Guzzle fails without providing a response
     */
    protected function doRequest($request)
    {
        // CONTENT_* server parameters are not prefixed with HTTP_. The keys
        // must use the casing produced by the normalization in the loop below
        // (CONTENT_TYPE becomes "Content-Type"), otherwise they never match.
        $contentHeaders = array('Content-Length' => true, 'Content-Md5' => true, 'Content-Type' => true);

        $headers = array();
        foreach ($request->getServer() as $key => $val) {
            // HTTP_USER_AGENT -> Http-User-Agent, CONTENT_TYPE -> Content-Type
            $key = implode('-', array_map('ucfirst', explode('-', strtolower(str_replace('_', '-', $key)))));

            if (0 === strpos($key, 'Http-')) {
                $headers[substr($key, 5)] = $val;
            } elseif (isset($contentHeaders[$key])) {
                $headers[$key] = $val;
            }
        }

        $method = $request->getMethod();

        // GET and HEAD carry no body; other methods send the raw content when
        // there is one and the request parameters otherwise.
        $body = null;
        if (!in_array($method, array('GET', 'HEAD'), true)) {
            if (null !== $request->getContent()) {
                $body = $request->getContent();
            } else {
                $body = $request->getParameters();
            }
        }

        $this->getClient()->setDefaultOption('auth', $this->auth);

        $requestOptions = array(
            'body' => $body,
            'cookies' => $this->getCookieJar()->allRawValues($request->getUri()),
            'allow_redirects' => false,
            'timeout' => 30,
        );

        if (!empty($headers)) {
            $requestOptions['headers'] = $headers;
        }

        $guzzleRequest = $this->getClient()->createRequest(
            $method,
            $request->getUri(),
            $requestOptions
        );

        // Headers registered with setHeader() are applied last and so take
        // precedence over those derived from the server parameters.
        foreach ($this->headers as $name => $value) {
            $guzzleRequest->setHeader($name, $value);
        }

        if ('POST' === $method || 'PUT' === $method) {
            $this->addPostFiles($guzzleRequest, $request->getFiles());
        }

        // Let BrowserKit handle redirects
        try {
            $response = $this->getClient()->send($guzzleRequest);
        } catch (RequestException $e) {
            $response = $e->getResponse();
            if (null === $response) {
                throw $e;
            }
        }

        return $this->createResponse($response);
    }

    /**
     * Attaches uploaded files to the body of a Guzzle request.
     *
     * Each entry is either a path, an array holding a "tmp_name" key, or a
     * nested array of such entries; nested keys become bracketed field names
     * ("form[test]"). Entries whose "tmp_name" is an empty string are skipped.
     *
     * @param RequestInterface $request
     * @param array            $files
     * @param string           $arrayName Field name of the enclosing array, '' at the top level
     */
    protected function addPostFiles(RequestInterface $request, array $files, $arrayName = '')
    {
        foreach ($files as $name => $info) {
            // Compared against '' instead of using empty(): a parent key of
            // 0 / '0' is a legitimate field name and must not be dropped.
            if ('' !== (string) $arrayName) {
                $name = $arrayName.'['.$name.']';
            }

            if (!is_array($info)) {
                // Plain path to the file.
                $request->getBody()->addFile(new PostFile($name, fopen($info, 'r')));
                continue;
            }

            if (!isset($info['tmp_name'])) {
                // Not a file description: descend into the nested fields.
                $this->addPostFiles($request, $info, $name);
                continue;
            }

            if ('' !== $info['tmp_name']) {
                $request->getBody()->addFile(new PostFile($name, fopen($info['tmp_name'], 'r')));
            }
        }
    }

    /**
     * Converts a Guzzle response into a BrowserKit response.
     *
     * @param GuzzleResponse $response
     *
     * @return Response
     */
    protected function createResponse(GuzzleResponse $response)
    {
        // Cast the body to a string so the BrowserKit response receives plain
        // text rather than the body object itself; a null body becomes ''.
        return new Response((string) $response->getBody(), $response->getStatusCode(), $response->getHeaders());
    }
}
PK s[AVg!
README.rstnu W+A Goutte, a simple PHP Web Scraper
================================
Goutte is a screen scraping and web crawling library for PHP.
Goutte provides a nice API to crawl websites and extract data from the HTML/XML
responses.
Requirements
------------
Goutte depends on PHP 5.4+ and Guzzle 4+.
.. tip::
If you need support for PHP 5.3 or Guzzle 3, use Goutte 1.0.6.
Installation
------------
Add ``fabpot/goutte`` as a require dependency in your ``composer.json`` file:
.. code-block:: bash
php composer.phar require fabpot/goutte:~2.0
.. tip::
You can also download the `Goutte.phar`_ file:
.. code-block:: php
require_once '/path/to/goutte.phar';
Usage
-----
Create a Goutte Client instance (which extends
``Symfony\Component\BrowserKit\Client``):
.. code-block:: php
use Goutte\Client;
$client = new Client();
Make requests with the ``request()`` method:
.. code-block:: php
// Go to the symfony.com website
$crawler = $client->request('GET', 'http://www.symfony.com/blog/');
The method returns a ``Crawler`` object
(``Symfony\Component\DomCrawler\Crawler``).
Click on links:
.. code-block:: php
// Click on the "Security Advisories" link
$link = $crawler->selectLink('Security Advisories')->link();
$crawler = $client->click($link);
Extract data:
.. code-block:: php
// Get the latest post in this category and display the titles
$crawler->filter('h2.post > a')->each(function ($node) {
print $node->text()."\n";
});
Submit forms:
.. code-block:: php
$crawler = $client->request('GET', 'http://github.com/');
$crawler = $client->click($crawler->selectLink('Sign in')->link());
$form = $crawler->selectButton('Sign in')->form();
$crawler = $client->submit($form, array('login' => 'fabpot', 'password' => 'xxxxxx'));
$crawler->filter('.flash-error')->each(function ($node) {
print $node->text()."\n";
});
More Information
----------------
Read the documentation of the BrowserKit and DomCrawler Symfony Components for
more information about what you can do with Goutte.
Technical Information
---------------------
Goutte is a thin wrapper around the following fine PHP libraries:
* Symfony Components: BrowserKit, CssSelector and DomCrawler;
* `Guzzle`_ HTTP Component.
License
-------
Goutte is licensed under the MIT license.
.. _`Composer`: http://getcomposer.org
.. _`Goutte.phar`: http://get.sensiolabs.org/goutte.phar
.. _`Guzzle`: http://docs.guzzlephp.org
PK s[AV] .travis.ymlnu W+A language: php
php:
- '5.6'
- '5.5'
- '5.4'
- hhvm
before_script:
- composer install -n
script:
- phpunit
matrix:
allow_failures:
- php: hhvm
PK s[AV box.jsonnu W+A {
"output": "goutte.phar",
"chmod": "0755",
"compactors": [
"Herrera\\Box\\Compactor\\Php"
],
"extract": false,
"files": [
"LICENSE",
"Goutte/Client.php"
],
"finder": [
{
"name": ["*.php", "*.pem*"],
"exclude": ["Tests", "tests"],
"in": "vendor"
}
],
"stub": "Goutte/Resources/phar-stub.php",
"web": false
}
PK s[AV{
composer.jsonnu W+A {
"name": "fabpot/goutte",
"type": "application",
"description": "A simple PHP Web Scraper",
"keywords": ["scraper"],
"homepage": "https://github.com/fabpot/Goutte",
"license": "MIT",
"authors": [
{
"name": "Fabien Potencier",
"email": "fabien@symfony.com"
}
],
"require": {
"php": ">=5.4.0",
"symfony/browser-kit": "~2.1",
"symfony/css-selector": "~2.1",
"symfony/dom-crawler": "~2.1",
"guzzlehttp/guzzle": "4.*"
},
"autoload": {
"psr-0": { "Goutte": "." }
},
"extra": {
"branch-alias": {
"dev-master": "2.0-dev"
}
}
}
PK s[AV
! !
.gitignorenu W+A composer.lock
phpunit.xml
vendor
PK s[AV_A43 3 phpunit.xml.distnu W+A PK s[AVT) ) s LICENSEnu W+A PK s[AV8迏) ) Goutte/Tests/ClientTest.phpnu W+A PK s[AVUi* * 0 Goutte/Resources/phar-stub.phpnu W+A PK s[AV'_ V2 Goutte/Client.phpnu W+A PK s[AVg!
E README.rstnu W+A PK s[AV] O .travis.ymlnu W+A PK s[AV P box.jsonnu W+A PK s[AV{
R composer.jsonnu W+A PK s[AV
! !
{U .gitignorenu W+A PK
U