PK (AV_A43 3 phpunit.xml.distnu W+A„¶
./Goutte/Tests
PK (AVØìT) ) LICENSEnu W+A„¶ Copyright (c) 2010-2013 Fabien Potencier
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is furnished
to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
PK (AV•M·¹à# à# Goutte/Tests/ClientTest.phpnu W+A„¶
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Goutte\Tests;
use Goutte\Client;
use Symfony\Component\BrowserKit\Cookie;
use Guzzle\Http\Message\Response as GuzzleResponse;
use Guzzle\Http\Message\Header as GuzzleHeader;
use Guzzle\Http\Client as GuzzleClient;
use Guzzle\Plugin\Mock\MockPlugin;
use Guzzle\Plugin\History\HistoryPlugin;
use Guzzle\Http\Message\Response;
use Guzzle\Http\Message\PostFile;
/**
* Goutte Client Test
*
* @author Michael Dowling
*/
class ClientTest extends \PHPUnit_Framework_TestCase
{
    /**
     * Records every request sent through the mocked Guzzle client.
     *
     * @var HistoryPlugin
     */
    protected $historyPlugin;

    /**
     * Serves queued responses instead of performing real HTTP calls.
     *
     * @var MockPlugin
     */
    protected $mockPlugin;

    /**
     * Builds a Guzzle client wired to fresh mock and history plugins.
     *
     * The mock queue is primed with a single 200 response whose body contains
     * a <p> element, so tests can select it with filter('p').
     *
     * @return GuzzleClient
     */
    protected function getGuzzle()
    {
        $this->historyPlugin = new HistoryPlugin();
        $this->mockPlugin = new MockPlugin();
        $this->mockPlugin->addResponse(new GuzzleResponse(200, null, '<html><body><p>Hi</p></body></html>'));

        $guzzle = new GuzzleClient('', array('redirect.disable' => true));
        $guzzle->getEventDispatcher()->addSubscriber($this->mockPlugin);
        $guzzle->getEventDispatcher()->addSubscriber($this->historyPlugin);

        return $guzzle;
    }

    /**
     * Creates a Goutte client that sends its requests through the mocked Guzzle client.
     *
     * @return Client
     */
    protected function createMockedClient()
    {
        $client = new Client();
        $client->setClient($this->getGuzzle());

        return $client;
    }

    public function testCreatesDefaultClient()
    {
        $client = new Client();
        $this->assertInstanceOf('Guzzle\\Http\\ClientInterface', $client->getClient());
    }

    public function testUsesCustomClient()
    {
        $guzzle = new GuzzleClient();
        $client = new Client();
        $this->assertSame($client, $client->setClient($guzzle));
        $this->assertSame($guzzle, $client->getClient());
    }

    public function testUsesCustomHeaders()
    {
        $client = $this->createMockedClient();
        $client->setHeader('X-Test', 'test');
        $client->request('GET', 'http://www.example.com/');

        $this->assertEquals('test', $this->historyPlugin->getLastRequest()->getHeader('X-Test'));
    }

    public function testCustomUserAgent()
    {
        $client = $this->createMockedClient();
        $client->setHeader('User-Agent', 'foo');
        $client->request('GET', 'http://www.example.com/');

        $this->assertEquals('foo', $this->historyPlugin->getLastRequest()->getHeader('User-Agent'));
    }

    public function testUsesAuth()
    {
        $client = $this->createMockedClient();
        $client->setAuth('me', '**');
        $client->request('GET', 'http://www.example.com/');

        $request = $this->historyPlugin->getLastRequest();
        $this->assertEquals('me', $request->getUsername());
        $this->assertEquals('**', $request->getPassword());
    }

    public function testResetsAuth()
    {
        $client = $this->createMockedClient();
        $client->setAuth('me', '**');
        $client->resetAuth();
        $client->request('GET', 'http://www.example.com/');

        $request = $this->historyPlugin->getLastRequest();
        $this->assertNull($request->getUsername());
        $this->assertNull($request->getPassword());
    }

    public function testUsesCookies()
    {
        $client = $this->createMockedClient();
        $client->getCookieJar()->set(new Cookie('test', '123'));
        $client->request('GET', 'http://www.example.com/');

        $request = $this->historyPlugin->getLastRequest();
        $this->assertEquals('123', $request->getCookie('test'));
    }

    public function testUsesPostFiles()
    {
        $client = $this->createMockedClient();
        $files = array(
            'test' => array(
                'name' => 'test.txt',
                'tmp_name' => __FILE__
            )
        );
        $client->request('POST', 'http://www.example.com/', array(), $files);

        $request = $this->historyPlugin->getLastRequest();
        $this->assertEquals(array(
            'test' => array(
                new PostFile('test', __FILE__, 'text/x-php')
            )
        ), $request->getPostFiles());
    }

    public function testUsesPostNamedFiles()
    {
        $client = $this->createMockedClient();
        $files = array(
            'test' => __FILE__
        );
        $client->request('POST', 'http://www.example.com/', array(), $files);

        $request = $this->historyPlugin->getLastRequest();
        $this->assertEquals(array(
            'test' => array(
                new PostFile('test', __FILE__, 'text/x-php')
            )
        ), $request->getPostFiles());
    }

    public function testUsesPostFilesNestedFields()
    {
        $client = $this->createMockedClient();
        $files = array(
            'form' => array(
                'test' => array(
                    'name' => 'test.txt',
                    'tmp_name' => __FILE__
                ),
            ),
        );
        $client->request('POST', 'http://www.example.com/', array(), $files);

        $request = $this->historyPlugin->getLastRequest();
        $this->assertEquals(array(
            'form[test]' => array(
                new PostFile('form[test]', __FILE__, 'text/x-php')
            )
        ), $request->getPostFiles());
    }

    public function testUsesPostFilesOnClientSide()
    {
        $client = $this->createMockedClient();
        $files = array(
            'test' => __FILE__,
        );
        $client->request('POST', 'http://www.example.com/', array(), $files);

        $request = $this->historyPlugin->getLastRequest();
        $this->assertEquals(array(
            'test' => array(
                new PostFile('test', __FILE__, 'text/x-php')
            )
        ), $request->getPostFiles());
    }

    public function testUsesPostFilesUploadError()
    {
        $client = $this->createMockedClient();
        // Mirrors a $_FILES entry with error 4 and an empty tmp_name.
        $files = array(
            'test' => array(
                'name' => '',
                'type' => '',
                'tmp_name' => '',
                'error' => 4,
                'size' => 0,
            ),
        );
        $client->request('POST', 'http://www.example.com/', array(), $files);

        $request = $this->historyPlugin->getLastRequest();
        $this->assertEquals(array(), $request->getPostFiles());
    }

    public function testUsesCurlOptions()
    {
        $client = $this->createMockedClient();
        $client->request('GET', 'http://www.example.com/');

        $request = $this->historyPlugin->getLastRequest();
        $this->assertEquals(0, $request->getCurlOptions()->get(CURLOPT_MAXREDIRS));
        $this->assertEquals(30, $request->getCurlOptions()->get(CURLOPT_TIMEOUT));
    }

    public function testCreatesResponse()
    {
        $client = $this->createMockedClient();
        $crawler = $client->request('GET', 'http://www.example.com/');

        $this->assertEquals('Hi', $crawler->filter('p')->text());
    }

    public function testHandlesRedirectsCorrectly()
    {
        $client = $this->createMockedClient();

        // Replace the default response with a redirect followed by the final page.
        $this->mockPlugin->clearQueue();
        $this->mockPlugin->addResponse(new GuzzleResponse(301, array(
            'Location' => 'http://www.example.com/'
        )));
        $this->mockPlugin->addResponse(new GuzzleResponse(200, null, '<html><body><p>Test</p></body></html>'));

        $crawler = $client->request('GET', 'http://www.example.com/');
        $this->assertEquals('Test', $crawler->filter('p')->text());

        // Ensure that two requests were sent
        $this->assertCount(2, $this->historyPlugin);
    }

    public function testConvertsGuzzleHeadersToArrays()
    {
        if (!class_exists("Guzzle\Http\Message\Header")) {
            $this->markTestSkipped("Guzzle ~3.6 required");
        }

        $client = $this->createMockedClient();
        $this->mockPlugin->clearQueue();
        $this->mockPlugin->addResponse(new GuzzleResponse(200, array(
            new GuzzleHeader('Date', 'Tue, 04 Jun 2013 13:22:41 GMT'),
        )));

        $client->request('GET', 'http://www.example.com/');
        $response = $client->getResponse();
        $headers = $response->getHeaders();

        $this->assertInternalType("array", array_shift($headers), "Header not converted from Guzzle\Http\Message\Header to array");
    }
}
PK (AVUiºþ* * Goutte/Resources/phar-stub.phpnu W+A„¶
*
* This source file is subject to the MIT license that is bundled
* with this source code in the file LICENSE.
*/
// Load the Composer autoloader that is bundled inside this phar archive.
require_once sprintf('phar://%s/vendor/autoload.php', __FILE__);

// Nothing after this statement is parsed as PHP code.
__HALT_COMPILER();
PK (AVçyÊ(÷ ÷ Goutte/Client.phpnu W+A„¶
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Goutte;
use Symfony\Component\BrowserKit\Client as BaseClient;
use Symfony\Component\BrowserKit\Response;
use Guzzle\Http\Exception\CurlException;
use Guzzle\Http\Exception\BadResponseException;
use Guzzle\Http\Message\Response as GuzzleResponse;
use Guzzle\Http\ClientInterface as GuzzleClientInterface;
use Guzzle\Http\Client as GuzzleClient;
use Guzzle\Http\Message\EntityEnclosingRequestInterface;
/**
* Client.
*
* @package Goutte
* @author Fabien Potencier
* @author Michael Dowling
*/
class Client extends BaseClient
{
    const VERSION = '0.2';

    /**
     * Headers added to every outgoing request, keyed by header name.
     *
     * @var array
     */
    protected $headers = array();

    /**
     * Credentials stored by setAuth() under the 'user', 'password' and 'type'
     * keys, or null when no authentication is configured.
     *
     * @var array|null
     */
    protected $auth = null;

    /**
     * Guzzle client used to send requests; created lazily by getClient().
     *
     * @var GuzzleClientInterface|null
     */
    protected $client;

    /**
     * Sets the Guzzle client used to send requests.
     *
     * @param GuzzleClientInterface $client
     *
     * @return Client This instance, for chaining
     */
    public function setClient(GuzzleClientInterface $client)
    {
        $this->client = $client;

        return $this;
    }

    /**
     * Returns the Guzzle client, creating a default one on first use.
     *
     * @return GuzzleClientInterface
     */
    public function getClient()
    {
        if (!$this->client) {
            $this->client = new GuzzleClient('', array(GuzzleClient::DISABLE_REDIRECTS => true));
        }

        return $this->client;
    }

    /**
     * Registers a header that is set on every subsequent request.
     *
     * @param string $name  Header name
     * @param string $value Header value
     *
     * @return Client This instance, for chaining
     */
    public function setHeader($name, $value)
    {
        $this->headers[$name] = $value;

        return $this;
    }

    /**
     * Removes a header previously registered with setHeader().
     *
     * @param string $name Header name
     *
     * @return Client This instance, for chaining
     */
    public function removeHeader($name)
    {
        unset($this->headers[$name]);

        return $this;
    }

    /**
     * Stores the credentials applied to every subsequent request.
     *
     * @param string $user     User name
     * @param string $password Password
     * @param int    $type     Authentication type, passed through to the Guzzle request
     *
     * @return Client This instance, for chaining
     */
    public function setAuth($user, $password = '', $type = CURLAUTH_BASIC)
    {
        $this->auth = array(
            'user' => $user,
            'password' => $password,
            'type' => $type
        );

        return $this;
    }

    /**
     * Discards the credentials stored by setAuth().
     *
     * @return Client This instance, for chaining
     */
    public function resetAuth()
    {
        $this->auth = null;

        return $this;
    }

    /**
     * Sends a BrowserKit request through Guzzle and converts the result back
     * into a BrowserKit response.
     *
     * @param object $request The BrowserKit request to send
     *
     * @return Response
     *
     * @throws CurlException When the cURL failure message does not mention redirects
     */
    protected function doRequest($request)
    {
        $headers = array();
        foreach ($request->getServer() as $key => $val) {
            // Normalise server keys into header names, e.g. HTTP_USER_AGENT becomes User-Agent.
            $key = implode('-', array_map('ucfirst', explode('-', strtolower(str_replace(array('_', 'HTTP-'), array('-', ''), $key)))));
            // The first value seen for a normalised name wins.
            if (!isset($headers[$key])) {
                $headers[$key] = $val;
            }
        }

        // GET and HEAD requests are sent without a body; other methods use the
        // raw content when present and fall back to the request parameters.
        $body = null;
        if (!in_array($request->getMethod(), array('GET', 'HEAD'), true)) {
            if (null !== $request->getContent()) {
                $body = $request->getContent();
            } else {
                $body = $request->getParameters();
            }
        }

        $guzzleRequest = $this->getClient()->createRequest(
            $request->getMethod(),
            $request->getUri(),
            $headers,
            $body
        );

        // Headers registered through setHeader() override the ones derived from the server bag.
        foreach ($this->headers as $name => $value) {
            $guzzleRequest->setHeader($name, $value);
        }

        if ($this->auth !== null) {
            $guzzleRequest->setAuth(
                $this->auth['user'],
                $this->auth['password'],
                $this->auth['type']
            );
        }

        foreach ($this->getCookieJar()->allRawValues($request->getUri()) as $name => $value) {
            $guzzleRequest->addCookie($name, $value);
        }

        if ('POST' == $request->getMethod() || 'PUT' == $request->getMethod()) {
            $this->addPostFiles($guzzleRequest, $request->getFiles());
        }

        $guzzleRequest->getParams()->set('redirect.disable', true);
        $curlOptions = $guzzleRequest->getCurlOptions();

        // Apply a 30 second timeout unless one is already set on the request.
        if (!$curlOptions->hasKey(CURLOPT_TIMEOUT)) {
            $curlOptions->set(CURLOPT_TIMEOUT, 30);
        }

        // Let BrowserKit handle redirects
        try {
            $response = $guzzleRequest->send();
        } catch (CurlException $e) {
            // strpos() returns 0 for a match at the very start of the message,
            // so compare against false instead of relying on truthiness.
            if (false === strpos($e->getMessage(), 'redirects')) {
                throw $e;
            }

            // NOTE(review): confirm that CurlException exposes getResponse()
            // in every supported Guzzle version.
            $response = $e->getResponse();
        } catch (BadResponseException $e) {
            $response = $e->getResponse();
        }

        return $this->createResponse($response);
    }

    /**
     * Attaches uploaded files to the Guzzle request, flattening nested fields
     * into bracketed names such as "form[test]".
     *
     * Each entry is either a file path, an array with a 'tmp_name' key, or a
     * nested array of further entries. Entries whose 'tmp_name' is an empty
     * string are skipped.
     *
     * @param object $request   The Guzzle request; ignored unless it can enclose an entity
     * @param array  $files     Files keyed by field name
     * @param string $arrayName Name of the parent field, or '' at the top level
     */
    protected function addPostFiles($request, array $files, $arrayName = '')
    {
        if (!$request instanceof EntityEnclosingRequestInterface) {
            return;
        }

        foreach ($files as $name => $info) {
            // Compare against the empty string rather than using empty(), so
            // that a parent field named "0" still prefixes its children.
            if ('' !== (string) $arrayName) {
                $name = $arrayName . '[' . $name . ']';
            }

            if (!is_array($info)) {
                $request->addPostFile($name, $info);
                continue;
            }

            if (!isset($info['tmp_name'])) {
                // No tmp_name key: treat the array as a group of nested fields.
                $this->addPostFiles($request, $info, $name);
                continue;
            }

            if ('' !== $info['tmp_name']) {
                $request->addPostFile($name, $info['tmp_name']);
            }
        }
    }

    /**
     * Converts a Guzzle response into a BrowserKit response.
     *
     * @param GuzzleResponse $response
     *
     * @return Response
     */
    protected function createResponse(GuzzleResponse $response)
    {
        $headers = $response->getHeaders()->toArray();

        return new Response($response->getBody(true), $response->getStatusCode(), $headers);
    }
}
PK (AVyŒoJ
README.rstnu W+A„¶ Goutte, a simple PHP Web Scraper
================================
Goutte is a screen scraping and web crawling library for PHP.
Goutte provides a nice API to crawl websites and extract data from the HTML/XML
responses.
Requirements
------------
Goutte works with PHP 5.3.3 or later.
Installation
------------
Add ``fabpot/goutte`` as a require dependency in your ``composer.json`` file using `Composer`_:
.. code-block:: bash
php composer.phar require fabpot/goutte:~1.0
.. tip::
You can also download the `Goutte.phar`_ file and require it in your script:
.. code-block:: php
require_once '/path/to/goutte.phar';
Usage
-----
Create a Goutte Client instance (which extends
``Symfony\Component\BrowserKit\Client``):
.. code-block:: php
use Goutte\Client;
$client = new Client();
Make requests with the ``request()`` method:
.. code-block:: php
// Go to the symfony.com website
$crawler = $client->request('GET', 'http://www.symfony.com/blog/');
The method returns a ``Crawler`` object
(``Symfony\Component\DomCrawler\Crawler``).
Click on links:
.. code-block:: php
// Click on the "Security Advisories" link
$link = $crawler->selectLink('Security Advisories')->link();
$crawler = $client->click($link);
Extract data:
.. code-block:: php
// Get the latest posts in this category and display their titles
$crawler->filter('h2.post > a')->each(function ($node) {
print $node->text()."\n";
});
Submit forms:
.. code-block:: php
$crawler = $client->request('GET', 'http://github.com/');
$crawler = $client->click($crawler->selectLink('Sign in')->link());
$form = $crawler->selectButton('Sign in')->form();
$crawler = $client->submit($form, array('login' => 'fabpot', 'password' => 'xxxxxx'));
$crawler->filter('.flash-error')->each(function ($node) {
print $node->text()."\n";
});
More Information
----------------
Read the documentation of the BrowserKit and DomCrawler Symfony Components for
more information about what you can do with Goutte.
Technical Information
---------------------
Goutte is a thin wrapper around the following fine PHP libraries:
* Symfony Components: BrowserKit, ClassLoader, CssSelector, DomCrawler, Finder,
and Process;
* `Guzzle`_ HTTP Component.
License
-------
Goutte is licensed under the MIT license.
.. _`Composer`: http://getcomposer.org
.. _`Goutte.phar`: http://get.sensiolabs.org/goutte.phar
.. _`Guzzle`: http://docs.guzzlephp.org
PK (AV"Þû÷Á Á .travis.ymlnu W+A„¶ language: php
php:
- '5.6'
- '5.5'
- '5.4'
- '5.3'
- hhvm
before_script:
- composer install -n
script:
- phpunit
matrix:
allow_failures:
- php: hhvm
PK (AVôž¸¬ ¬ box.jsonnu W+A„¶ {
"output": "goutte.phar",
"chmod": "0755",
"compactors": [
"Herrera\\Box\\Compactor\\Php"
],
"extract": false,
"files": [
"LICENSE",
"Goutte/Client.php"
],
"finder": [
{
"name": ["*.php", "*.pem*"],
"exclude": ["Tests", "tests"],
"in": "vendor"
}
],
"stub": "Goutte/Resources/phar-stub.php",
"web": false
}
PK (AVDÔðrw w
composer.jsonnu W+A„¶ {
"name": "fabpot/goutte",
"type": "application",
"description": "A simple PHP Web Scraper",
"keywords": ["scraper"],
"homepage": "https://github.com/fabpot/Goutte",
"license": "MIT",
"authors": [
{
"name": "Fabien Potencier",
"email": "fabien@symfony.com"
}
],
"require": {
"php": ">=5.3.0",
"ext-curl": "*",
"symfony/browser-kit": "~2.1",
"symfony/css-selector": "~2.1",
"symfony/dom-crawler": "~2.1",
"symfony/finder": "~2.1",
"symfony/process": "~2.1",
"guzzle/http": "~3.1"
},
"require-dev": {
"guzzle/plugin-history": "~3.1",
"guzzle/plugin-mock": "~3.1"
},
"autoload": {
"psr-0": { "Goutte": "." }
},
"extra": {
"branch-alias": {
"dev-master": "1.0-dev"
}
}
}
PK (AVúé
! !
.gitignorenu W+A„¶ composer.lock
phpunit.xml
vendor
PK (AV_A43 3 phpunit.xml.distnu W+A„¶ PK (AVØìT) ) s LICENSEnu W+A„¶ PK (AV•M·¹à# à# Ó Goutte/Tests/ClientTest.phpnu W+A„¶ PK (AVUiºþ* * þ* Goutte/Resources/phar-stub.phpnu W+A„¶ PK (AVçyÊ(÷ ÷ v, Goutte/Client.phpnu W+A„¶ PK (AVyŒoJ
®@ README.rstnu W+A„¶ PK (AV"Þû÷Á Á ˆJ .travis.ymlnu W+A„¶ PK (AVôž¸¬ ¬ „K box.jsonnu W+A„¶ PK (AVDÔðrw w
hM composer.jsonnu W+A„¶ PK (AVúé
! !
Q .gitignorenu W+A„¶ PK
wQ