From 0883adca546675d1546e16be817737bbbbb04d90 Mon Sep 17 00:00:00 2001 From: Alexander Artemenko Date: Mon, 18 Mar 2024 19:52:47 +0000 Subject: [PATCH] Add tutorial and docs. --- .github/workflows/ci.yml | 36 +- .github/workflows/docs.yml | 36 +- .github/workflows/linter.yml | 36 +- ChangeLog.md | 2 +- README.md | 979 +++++++++++++++++++++++++++++++---- docs/changelog.lisp | 2 +- docs/index.lisp | 15 +- docs/tutorial.lisp | 892 +++++++++++++++++++++++++++++++ scrapycl.asd | 4 +- src/core.lisp | 21 +- src/downloader.lisp | 10 +- src/engine.lisp | 5 +- src/errors.lisp | 12 +- src/output.lisp | 5 - src/output/json.lisp | 74 +-- src/output/typed.lisp | 6 +- src/request.lisp | 7 +- src/types.lisp | 12 + src/utils.lisp | 3 +- tutorial/step1.lisp | 14 +- tutorial/step2.lisp | 12 +- tutorial/step3.lisp | 5 +- tutorial/tutorial.md | 751 --------------------------- 23 files changed, 1908 insertions(+), 1031 deletions(-) create mode 100644 docs/tutorial.lisp delete mode 100644 src/output.lisp create mode 100644 src/types.lisp diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cd5384c..fce890d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -33,41 +33,15 @@ "steps": [ { "name": "Checkout Code", - "uses": "actions/checkout@v3" - }, - { - "name": "Grant All Perms to Make Cache Restoring Possible", - "run": "sudo mkdir -p /usr/local/etc/roswell\n sudo chown \"${USER}\" /usr/local/etc/roswell\n # Here the ros binary will be restored:\n sudo chown \"${USER}\" /usr/local/bin", - "shell": "bash" - }, - { - "name": "Get Current Month", - "id": "current-month", - "run": "echo \"value=$(date -u \"+%Y-%m\")\" >> $GITHUB_OUTPUT", - "shell": "bash" - }, - { - "name": "Cache Roswell Setup", - "id": "cache", - "uses": "actions/cache@v3", - "with": { - "path": "qlfile\nqlfile.lock\n~/.cache/common-lisp/\n~/.roswell\n/usr/local/etc/roswell\n/usr/local/bin/ros\n/usr/local/Cellar/roswell\n.qlot", - "key": "a-${{ steps.current-month.outputs.value }}-${{ env.cache-name }}-ubuntu-latest-quicklisp-${{ matrix.lisp }}-${{ hashFiles('qlfile.lock', '*.asd') }}" - } - }, - { - "name": "Restore Path To Cached Files", - "run": "echo $HOME/.roswell/bin >> $GITHUB_PATH\n echo .qlot/bin >> $GITHUB_PATH", - "shell": "bash", - "if": "steps.cache.outputs.cache-hit == 'true'" + "uses": "actions/checkout@v4" }, { "name": "Setup Common Lisp Environment", - "uses": "40ants/setup-lisp@v3", + "uses": "40ants/setup-lisp@v4", "with": { - "asdf-system": "scrapycl" - }, - "if": "steps.cache.outputs.cache-hit != 'true'" + "asdf-system": "scrapycl", + "cache": "true" + } }, { "name": "Run Tests", diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 8a54ccb..cfada9e 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -24,41 +24,15 @@ "steps": [ { "name": "Checkout Code", - "uses": "actions/checkout@v3" - }, - { - "name": "Grant All Perms to Make Cache Restoring Possible", - "run": "sudo mkdir -p /usr/local/etc/roswell\n sudo chown \"${USER}\" /usr/local/etc/roswell\n # Here the ros binary will be restored:\n sudo chown \"${USER}\" /usr/local/bin", - "shell": "bash" - }, - { - "name": "Get Current Month", - "id": "current-month", - "run": "echo \"value=$(date -u \"+%Y-%m\")\" >> $GITHUB_OUTPUT", - "shell": "bash" - }, - { - "name": "Cache Roswell Setup", - "id": "cache", - "uses": "actions/cache@v3", - "with": { - "path": "qlfile\nqlfile.lock\n~/.cache/common-lisp/\n~/.roswell\n/usr/local/etc/roswell\n/usr/local/bin/ros\n/usr/local/Cellar/roswell\n.qlot", - 
"key": "a-${{ steps.current-month.outputs.value }}-${{ env.cache-name }}-ubuntu-latest-quicklisp-sbcl-bin-${{ hashFiles('qlfile.lock', '*.asd') }}" - } - }, - { - "name": "Restore Path To Cached Files", - "run": "echo $HOME/.roswell/bin >> $GITHUB_PATH\n echo .qlot/bin >> $GITHUB_PATH", - "shell": "bash", - "if": "steps.cache.outputs.cache-hit == 'true'" + "uses": "actions/checkout@v4" }, { "name": "Setup Common Lisp Environment", - "uses": "40ants/setup-lisp@v3", + "uses": "40ants/setup-lisp@v4", "with": { - "asdf-system": "scrapycl-docs" - }, - "if": "steps.cache.outputs.cache-hit != 'true'" + "asdf-system": "scrapycl-docs", + "cache": "true" + } }, { "name": "Build Docs", diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml index 55e5ec7..4840522 100644 --- a/.github/workflows/linter.yml +++ b/.github/workflows/linter.yml @@ -24,41 +24,15 @@ "steps": [ { "name": "Checkout Code", - "uses": "actions/checkout@v3" - }, - { - "name": "Grant All Perms to Make Cache Restoring Possible", - "run": "sudo mkdir -p /usr/local/etc/roswell\n sudo chown \"${USER}\" /usr/local/etc/roswell\n # Here the ros binary will be restored:\n sudo chown \"${USER}\" /usr/local/bin", - "shell": "bash" - }, - { - "name": "Get Current Month", - "id": "current-month", - "run": "echo \"value=$(date -u \"+%Y-%m\")\" >> $GITHUB_OUTPUT", - "shell": "bash" - }, - { - "name": "Cache Roswell Setup", - "id": "cache", - "uses": "actions/cache@v3", - "with": { - "path": "qlfile\nqlfile.lock\n~/.cache/common-lisp/\n~/.roswell\n/usr/local/etc/roswell\n/usr/local/bin/ros\n/usr/local/Cellar/roswell\n.qlot", - "key": "a-${{ steps.current-month.outputs.value }}-${{ env.cache-name }}-ubuntu-latest-quicklisp-sbcl-bin-${{ hashFiles('qlfile.lock', '*.asd') }}" - } - }, - { - "name": "Restore Path To Cached Files", - "run": "echo $HOME/.roswell/bin >> $GITHUB_PATH\n echo .qlot/bin >> $GITHUB_PATH", - "shell": "bash", - "if": "steps.cache.outputs.cache-hit == 'true'" + "uses": "actions/checkout@v4" }, { "name": "Setup Common Lisp Environment", - "uses": "40ants/setup-lisp@v3", + "uses": "40ants/setup-lisp@v4", "with": { - "asdf-system": "scrapycl" - }, - "if": "steps.cache.outputs.cache-hit != 'true'" + "asdf-system": "scrapycl", + "cache": "true" + } }, { "name": "Change dist to Ultralisp if qlfile does not exist", diff --git a/ChangeLog.md b/ChangeLog.md index 89d4a27..e8a990b 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -4,7 +4,7 @@ -## 0.1.0 (2023-02-05) +## 0.1.0 (2024-03-17) * Initial version. diff --git a/README.md b/README.md index 4082743..6a5b3d4 100644 --- a/README.md +++ b/README.md @@ -6,14 +6,13 @@ ## SCRAPYCL ASDF System Details -* Version: 0.1.0 * Description: The web scraping framework for writing crawlers in Common Lisp. 
* Licence: Unlicense * Author: Alexander Artemenko * Homepage: [https://40ants.com/scrapycl/][174b] * Bug tracker: [https://github.com/40ants/scrapycl/issues][1e8e] * Source control: [GIT][66a8] -* Depends on: [40ants-doc][2c00], [alexandria][8236], [bordeaux-threads][3dbf], [dexador][8347], [log4cl][7f8b], [log4cl-extras][691c], [lquery][557a], [quri][2103], [serapeum][c41d], [spinneret][8175], [str][ef7f] +* Depends on: [40ants-doc][2c00], [alexandria][8236], [bordeaux-threads][3dbf], [closer-mop][61a4], [dexador][8347], [log4cl][7f8b], [log4cl-extras][691c], [lquery][557a], [quri][2103], [serapeum][c41d], [spinneret][8175], [str][ef7f], [yason][aba2] [![](https://github-actions.40ants.com/40ants/scrapycl/matrix.svg?only=ci.run-tests)][5d38] @@ -30,11 +29,805 @@ You can install this library from Quicklisp, but you want to receive updates qui :prompt nil) (ql:quickload :scrapycl) ``` - + -## Usage +## Tutorial -`TODO`: Write a library description. Put some examples here. + + +### Introduction + +In this tutorial we'll train our parsing skill on this toy site: https://quotes.toscrape.com/. +We will follow [Scrapy's tutorial][b2a5] and see if we can get all the data using Scrapy`CL`. + +You will find whole code for this tutorial in the `tutorial/` folder. + +Firstly Scrapy tutorial shows us how to experiment with `HTTP` response in the `REPL`. But with Common Lisp we have much more sofisticated `REPL` out of the box. So we skip this step: + +``` +scrapy shell "https://quotes.toscrape.com/page/1/" +``` +Right to the Common Lisp `REPL`! + + + +### Our First Scraper + +Scrapy`CL` is built around `CLOS`. Every scraping pipeline in this framework operates on `CLOS` objects. +Most generic-functions accept a [`scrapycl:spider`][dcea] object as a first argument. Also, requests to `HTML` pages are typed. +This way you are telling to the framework how each page should be processed. + +First thing we need to do is to define a class of the request to a page with quotes: + +```lisp +CL-USER> (defclass quotes-page-request (scrapycl:request) + ()) +# +``` +Next, we define a class for the spider: + +```lisp +CL-USER> (defclass quotes-spider (scrapycl:spider) + () + (:default-initargs + :initial-requests (list (make-instance 'quotes-page-request + :url "https://quotes.toscrape.com/page/1/") + (make-instance 'quotes-page-request + :url "https://quotes.toscrape.com/page/2/")))) +``` +Here we tell the spider to start from two initial pages. + +Now it is time to make our first `HTTP` request and to see content of the page. +I'll save the page's content to a variable to be able to play with parsing. + +```lisp +CL-USER> (defparameter *response* + (scrapycl:fetch (make-instance 'quotes-spider) + (make-instance 'quotes-page-request + :url "https://quotes.toscrape.com/page/1/"))) +*RESPONSE* +CL-USER> *response* +" + + + + Quotes to Scrape + + (scrapycl:start (make-instance 'quotes-spider) :wait t) +(# + #) +``` +It returns initial page requests as is because we didn't write a method for [`scrapycl:process`][0482] generic-function. 
Now we'll define it to save content into the files: + +``` +CL-USER> (defmethod scrapycl:process ((spider quotes-spider) + (request quotes-page-request)) + (multiple-value-bind (data url) + (scrapycl:fetch spider request) + (let* ((page-number (third (str:split "/" (quri:uri-path url)))) + (filename (format nil "quotes-~A.html" page-number))) + (alexandria:write-string-into-file data filename + :if-exists :supersede) + (log:info "Page saved to" filename) + ;; return nothing, to stop processing + (values)))) +# +``` +Next attempt to start the scraper will output information that data was saved to the files: + +``` +CL-USER> (scrapycl:start (make-instance 'quotes-spider) :wait t) + [18:30:37] cl-user (process quotes-spider quotes-page-request) - + Page saved to FILENAME: "quotes-2.html" + [18:30:37] cl-user (process quotes-spider quotes-page-request) - + Page saved to FILENAME: "quotes-1.html" +NIL +``` +Now it is time to extract useful information out from these `HTML` pages. + + + +### Extracting the Data + +Where Scrapy shows iPython `REPL`: + +``` +>>> response.css("title::text").getall() +['Quotes to Scrape'] +``` +We have a full-featured Common Lisp `REPL`. For `HTML` parsing we'll use great [lQuery library][8cd8]. Here is how we can reproduce python code in Lisp using lquery `DSL`: + +``` +CL-USER> (lquery:$ + (initialize *response*) + "title" + (text)) +#("Quotes to Scrape") +``` +lQuery is parses data in a functional way. It has [amazing documentation][8cd8]. Take a moment and read it to understand the basic principles. + +Then Python tutorial shows us what `getall` method returns: + +``` +response.css("title").getall() +['Quotes to Scrape'] +``` +With lisp we could do the same. Just drop `(text)` form at the end of the lquery pipeline: + +``` +CL-USER> (lquery:$ + (initialize *response*) + "title") +#(#) +``` +But this code returns us a set of `HTML` nodes. If you want to see the actual `HTML` code behind it, +use `(serialize)` form: + +``` +CL-USER> (lquery:$ + (initialize *response*) + "title" + (serialize)) +#("Quotes to Scrape") +``` +To get only a single item in Python you use `get` method instead of `getall`: + +``` +>>> response.css("title::text").get() +'Quotes to Scrape' +``` +In Lisp use `lquery:$1` macro instead of `lquery:$`: + +``` +CL-USER> (lquery:$1 + (initialize *response*) + "title" + (text)) +"Quotes to Scrape" +``` +You see, it returns a single object instead of an array! 
+ +As an alternative, you could’ve written in Python: + +``` +>>> response.css("title::text")[0].get() +'Quotes to Scrape' +``` +In Lisp we can do the same: + +``` +CL-USER> (let* ((nodes (lquery:$ + (initialize *response*) + "title")) + (title-node (elt nodes 0))) + (plump:text title-node)) +"Quotes to Scrape" +``` +There is no analogue to `re` from Scrapy in lQuery: + +``` +>>> response.css("title::text").re(r"Quotes.*") +['Quotes to Scrape'] +``` +but you can use a filter function: + +``` +CL-USER> (lquery:$ + (initialize *response*) + "title" + (text) + (filter (lambda (text) + (cl-ppcre:scan "Quotes.*" text)))) +#("Quotes to Scrape") +``` +Another example from Python code: + +``` +>>> response.css("title::text").re(r"Q\w+") +['Quotes'] +``` +Becomes in Lisp: + +``` +CL-USER> (lquery:$ + (initialize *response*) + "title" + (text) + (each (lambda (text) + (cl-ppcre:scan-to-strings "Q\\w+" text)) + :replace t)) +#("Quotes") +``` +And this Python code: + +``` +>>> response.css("title::text").re(r"(\w+) to (\w+)") +['Quotes', 'Scrape'] +``` +becomes: + +``` +CL-USER> (lquery:$1 + (initialize *response*) + "title" + (text) + (map (lambda (text) + (nth-value 1 + (cl-ppcre:scan-to-strings "(\\w+) to (\\w+)" + text))))) +#("Quotes" "Scrape") +``` + + +### Extracting Quotes and Authors (step2.lisp) + +We already have first page's content in the `*response*` variable. Now let's extract quotes! + +Instead of this Python code: + +``` +response.css("div.quote") +[, +, +...] +``` +We can do this in Lisp: + +``` +CL-USER> (lquery:$ + (initialize *response*) + "div.quote") +#(# # + # # + # # + # #) +``` +Here is how we can to limit the number of items to not clutter the `REPL`. lQuery provides a `(function ...)` form where you can call any function you like. We'll use it to apply Serapeum's `take` function to cut only two first elements from the array of nodes: + +``` +CL-USER> (lquery:$ + (initialize *response*) + "div.quote" + (function + (lambda (nodes) + (serapeum:take 2 nodes)))) +#(# #) +``` +Now it is easy to add `(serialize)` form and preview the extracted pieces: + +``` +CL-USER> (lquery:$ + (initialize *response*) + "div.quote" + (function + (lambda (nodes) + (serapeum:take 2 nodes))) + (serialize)) +#("
+ “The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.” + by Albert Einstein + (about) + +
+ Tags: + > + + change + + deep-thoughts + + thinking + + world + +
+
" + "
+ “It is our choices, Harry, that show what we truly are, far more than our abilities.” + by J.K. Rowling + (about) + +
+ Tags: + > + + abilities + + choices + +
+
") +``` +Now let's extract the first quote. Instead of this code in Python which sequentilly extracts quote node and then it's subelements: + +``` +>>> text = quote.css("span.text::text").get() +>>> text +'“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”' +>>> author = quote.css("small.author::text").get() +>>> author +'Albert Einstein' +``` +We will use the power of functional approach and extract all needed data in a single pipeline using lQuery's form `(combine ...)`: + +``` +CL-USER> (lquery:$ + (initialize *response*) + "div.quote" + (function + (lambda (nodes) + (serapeum:take 2 nodes))) + (combine + (lquery:$1 + "span.text" + (text)) + (lquery:$1 + "small.author" + (text)))) +#(("“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”" + "Albert Einstein") + ("“It is our choices, Harry, that show what we truly are, far more than our abilities.”" + "J.K. Rowling")) +``` +Note, this code we put after the `serapeum:take 2`: + +``` +(combine + (lquery:$1 + "span.text" + (text)) + (lquery:$1 + "small.author" + (text)))) +``` +It allows us to extract two subelements of the `div.quote` node simultaneously, using function `combine`. These two pieces are combined into an array like: + +``` +#("“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”" + "Albert Einstein") +``` +But because of functional nature of lQuery, this `combine` operation is applied to all `div.quote` nodes on the page and we don't have to write explicit iteration loop. + +After that, original Scrapy's tutorial shows us how to extract tags list for each quote: + +``` +>>> tags = quote.css("div.tags a.tag::text").getall() +>>> tags +['change', 'deep-thoughts', 'thinking', 'world'] +``` +but knowing how does `combine` work, we can just add another rule into the `combine` form: + +``` +CL-USER> (lquery:$ + (initialize *response*) + "div.quote" + (function + (lambda (nodes) + (serapeum:take 2 nodes))) + (combine + (lquery:$1 + "span.text" + (text)) + (lquery:$1 + "small.author" + (text)) + (lquery:$ + "div.tags a.tag" + (text)))) +#(("“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”" + "Albert Einstein" #("change" "deep-thoughts" "thinking" "world")) + ("“It is our choices, Harry, that show what we truly are, far more than our abilities.”" + "J.K. Rowling" #("abilities" "choices"))) +``` +Note, for tags we are using `lquery:$` because there is a list of them. + +Scrapy's tutorial creates a hash table for each quote, but Scrapy`CL` framework's pipeline operates on `CLOS` objects. 
So, we'll create a separate `QUOTE-ITEM` class: + +``` +CL-USER> (defclass quote-item () + ((text :initarg :text + :type string + :reader quote-text) + (author :initarg :author + :type string + :reader quote-author) + (tags :initarg :tags + :type (serapeum:soft-list-of string) + :reader quote-tags))) +# + +CL-USER> (defmethod print-object ((obj quote-item) stream) + (print-unreadable-object (obj stream :type t) + (format stream "~A by ~A ~{#~A~^, ~}" + (quote-text obj) + (quote-author obj) + (quote-tags obj)))) +``` +Now we'll use `map-apply` to transform parsed data into these `CLOS` objects: + +``` +CL-USER> (lquery:$ + (initialize *response*) + "div.quote" + (function + (lambda (nodes) + (serapeum:take 2 nodes))) + (combine + (lquery:$1 + "span.text" + (text)) + (lquery:$1 + "small.author" + (text)) + (lquery:$ + "div.tags a.tag" + (text))) + (map-apply + (lambda (text author tags) + (make-instance 'quote-item + :text text + :author author + :tags (coerce tags 'list))))) +#(# + #) +``` +Put this piece of code into our method for [`scrapycl:process`][0482] generic-function: + +``` +CL-USER> (defmethod scrapycl:process ((spider quotes-spider) + (request quotes-page-request)) + (let ((data (scrapycl:fetch spider request))) + (lquery:$ + (initialize data) + "div.quote" + (combine + (lquery:$1 + "span.text" + (text)) + (lquery:$1 + "small.author" + (text)) + (lquery:$ + "div.tags a.tag" + (text))) + (map-apply + (lambda (text author tags) + (make-instance 'quote-item + :text text + :author author + :tags (coerce tags 'list))))))) +``` +And don't forget to remove this piece of code limiting the number of processed quotes: + +``` +(function + (lambda (nodes) + (serapeum:take 2 nodes))) +``` +we needed it only for a debug purpose. + +Now start our scraper: + +``` +CL-USER> (scrapycl:start (make-instance 'quotes-spider) :wait t) +(# + # +... +``` +As you can see, by default it returns a list of all items on the page, but in real world you will want this data to be saved or processed. In the next part we'll see how to do this. + + + +### Storing the Scraped Data + +Scrapy's tutorial shows this command as example on how to save scraped data to a json file: + +``` +scrapy crawl quotes -O quotes.json +``` +With Scrapy`CL` we can do the similar but with `OUTPUT` argument to the [`scrapycl:start`][24dc] generic-function and [`scrapycl:json-lines`][8b56] function: + +``` +CL-USER> (scrapycl:start (make-instance 'quotes-spider) + :wait t + :output (scrapycl:json-lines #P"items.json")) +``` +And content of `items.json` file will look like: + +```json +{"text":"“A day without sunshine is like, you know, night.”","author":"Steve Martin","tags":["humor","obvious","simile"]} +{"text":"“A woman is like a tea bag; you never know how strong it is until it's in hot water.”","author":"Eleanor Roosevelt","tags":["misattributed-eleanor-roosevelt"]} +``` +Each object is on it's own line in a [JsonLines][2490] format. If you want to get a `JSON` file with a list, then use [`scrapycl:json-list`][2cad] instead. Or use [`scrapycl:json-dict`][8baf] to get a file with `JSON` object. But beware, these two outputs can't work in `:APPEND` mode. + + + +#### Custom processing + +Actually, `OUTPUT` argument accepts any lisp function. The only requirements are: + +* This function should accept a single argument. The objects for which there is no specialized method of [`scrapycl:process`][0482] generic-function will be passed into this function. 
+* It should accept `SCRAPYCL:STOP-OUTPUT` symbol and flush buffer, closing a file or a transaction, because this symbol is sent when all hrefs were processed and there is no more data to process. + + + +### Following the Links + +In Scrapy framework for following links you should yield a request object like this: + +``` +yield scrapy.Request(next_page, callback=self.parse) +``` +With Scrapy`CL` to follow links, you need to return new request objects from your method for [`scrapycl:process`][0482] generic-function. And because you are returning an object of a customized request class, you can add more data slots to the request to make this additional data available during request processing. For example, such data might include a parent category of the page or an some piece of data available on other page. + +Scrapy's test site contains quotes and their authors. Let's make our scraper parse not only quotes but also their authors. See `tutorial/step3.lisp` file for the full code for this part. + +First, we need to add an `AUTHOR-ITEM` class: + +``` +CL-USER> (defclass author-item () + ((name :initarg :name + :type string + :reader author-name) + (birthday :initarg :birthday + :type string + :reader author-birthday) + (bio :initarg :bio + :type string + :reader author-bio))) +# + +CL-USER> (defmethod print-object ((obj author-item) stream) + (print-unreadable-object (obj stream :type t) + (format stream "~A" + (author-name obj)))) +# +``` +Now let's make our spider do follow links leading to the next page and to the authors pages. + +Here is how we can extract the link to the next page: + +``` +CL-USER> (lquery:$1 + (initialize *response*) + "ul.pager a" + (attr "href")) +"/page/2/" +``` +But we need an absolute `URL` for request. So we have to merge this path with a base `URL`. + +[`scrapycl:fetch`][1128] generic-function returns current page's real `URL` as a second value. Also, Scrapy`CL` provides a `MERGE-WITH-URL` lquery form. Together they can be used like this: + +``` +CL-USER> (multiple-value-bind (response base-url) + (scrapycl:fetch (make-instance 'quotes-spider) + (make-instance 'quotes-page-request + :url "https://quotes.toscrape.com/")) + (lquery:$1 + (initialize response) + "ul.pager a" + (attr "href") + (merge-url-with base-url))) +"https://quotes.toscrape.com/page/2/" +``` +It is better to use `URL` returned by [`scrapycl:fetch`][1128] generic-function because of these two reasons: + +* This `URL` can differ from original request `URL` because site might redirect request to the other page. +* Href attributes on the page can be relative, like `../quotes/page/1` and will not work if you'll hardcode base url. + +Let's figure out which author pages should be followed. Original Scrapy tutorial uses this `CSS` selector `.author + a`, but lquery does not support `+` selector. 
To find a siblings of `.author` element but we can use `NEXT` form to select subsequent element following the `author` node: + +``` +CL-USER> (multiple-value-bind (response base-url) + (scrapycl:fetch (make-instance 'quotes-spider) + (make-instance 'quotes-page-request + :url "https://quotes.toscrape.com/")) + (lquery:$ + (initialize response) + ".author" + (next "a") + (attr "href") + (merge-url-with base-url))) +#("https://quotes.toscrape.com/author/Albert-Einstein" + "https://quotes.toscrape.com/author/J-K-Rowling" + "https://quotes.toscrape.com/author/Albert-Einstein" + "https://quotes.toscrape.com/author/Jane-Austen" + "https://quotes.toscrape.com/author/Marilyn-Monroe" + "https://quotes.toscrape.com/author/Albert-Einstein" + "https://quotes.toscrape.com/author/Andre-Gide" + "https://quotes.toscrape.com/author/Thomas-A-Edison" + "https://quotes.toscrape.com/author/Eleanor-Roosevelt" + "https://quotes.toscrape.com/author/Steve-Martin") +``` +Ok, now, when we have a `URL`s to follow, let's modify our processing function to return them as new requests: + +``` +CL-USER> (defclass author-page-request (scrapycl:request) + ()) +# + + +CL-USER> (defmethod scrapycl:process ((spider quotes-spider) + (request quotes-page-request)) + (multiple-value-bind (data base-url) + (scrapycl:fetch spider request) + (log:info "Fetched" base-url) + + (let ((quotes (lquery:$ + (initialize data) + "div.quote" + (combine + (lquery:$1 + "span.text" + (text)) + (lquery:$1 + "small.author" + (text)) + (lquery:$ + "div.tags a.tag" + (text))) + (map-apply + (lambda (text author tags) + (make-instance 'quote-item + :text text + :author author + :tags (coerce tags 'list)))))) + (next-page-url (lquery:$1 + (initialize data) + "ul.pager a" + (attr "href") + (merge-url-with base-url))) + (author-urls (lquery:$ + (initialize data) + ".author" + (next "a") + (attr "href") + (merge-url-with base-url)))) + ;; Now return objects and new requests + (list quotes + (map 'list (lambda (url) + (make-instance 'author-page-request + :url url)) + author-urls) + (when next-page-url + (make-instance 'quotes-page-request + :url next-page-url)))))) +# +``` +We return objects of three types from this processing method: quote-items, quotes-page-requests and author-page-requests. + +Now if we will run our scraper, then we'll see it walks only through quotes pages and ignores author pages: + +``` +CL-USER> (scrapycl:start (make-instance 'quotes-spider) + :wait t + :output (scrapycl:json-lines #P"items.json")) + [19:25:21] cl-user (process quotes-spider quotes-page-request) - + Fetched BASE-URL: # + + [19:25:21] cl-user (process quotes-spider quotes-page-request) - + Fetched BASE-URL: # + +NIL +``` +But in the file `items.json` you might see interesting records: + +``` +{"url":"https://quotes.toscrape.com/author/Steve-Martin"} +{"url":"https://quotes.toscrape.com/author/Eleanor-Roosevelt"} +{"url":"https://quotes.toscrape.com/author/Thomas-A-Edison"} +{"url":"https://quotes.toscrape.com/author/Andre-Gide"} +... +``` +This is because we forgot to define a processing method for our class `AUTHOR-PAGE-REQUEST`. Scrapy`CL` sees objects without a processing method and decides these are final objects to be serialized to the output. Let's write a method to extract information about authors as well. 
+ +Here I've just translated these Python rules: + +``` +def parse_author(self, response): + def extract_with_css(query): + return response.css(query).get(default="").strip() + + yield { + "name": extract_with_css("h3.author-title::text"), + "birthdate": extract_with_css(".author-born-date::text"), + "bio": extract_with_css(".author-description::text"), + } +``` +into the lquery `DSL`: + +``` +CL-USER> (multiple-value-bind (response) + (scrapycl:fetch (make-instance 'quotes-spider) + (make-instance 'author-page-request + :url "https://quotes.toscrape.com/author/Thomas-A-Edison")) + (lquery:$1 + (initialize response) + (combine + (lquery:$1 + "h3.author-title" + (text)) + (lquery:$1 + ".author-born-date" + (text)) + (lquery:$1 + ".author-description" + (text) + (map #'str:trim))))) + +("Thomas A. Edison" "February 11, 1847" + "Thomas Alva Edison was an American inventor, scientist and businessman who developed many devices that greatly influenced life around the world, including the phonograph, the motion picture camera, and a long-lasting, practical electric light bulb. Dubbed \"The Wizard of Menlo Park\" (now Edison, New Jersey) by a newspaper reporter, he was one of the first inventors to apply the principles of mass production and large teamwork to the process of invention, and therefore is often credited with the creation of the first industrial research laboratory.Edison is considered one of the most prolific inventors in history, holding 1,093 U.S. patents in his name, as well as many patents in the United Kingdom, France and Germany. He is credited with numerous inventions that contributed to mass communication and, in particular, telecommunications. His advanced work in these fields was an outgrowth of his early career as a telegraph operator. Edison originated the concept and implementation of electric-power generation and distribution to homes, businesses, and factories – a crucial development in the modern industrialized world. His first power station was on Manhattan Island, New York.") +``` +And here is the full processing method which will return an author object: + +``` +CL-USER> (defmethod scrapycl:process ((spider quotes-spider) + (request author-page-request)) + (multiple-value-bind (data base-url) + (scrapycl:fetch spider request) + (log:info "Fetched" base-url) + + (lquery:$1 + (initialize data) + (combine + (lquery:$1 + "h3.author-title" + (text)) + (lquery:$1 + ".author-born-date" + (text)) + (lquery:$1 + ".author-description" + (text) + (map #'str:trim))) + (map-apply + (lambda (name birthday bio) + (make-instance 'author-item + :name name + :birthday birthday + :bio bio)))))) + +# +``` +Now, if you start the our spider again, you'll get quotes and authors mixed in the same `items.json` file. + +But how to put different kinds of object into a different output files? + +This is easy - just use a [`scrapycl:typed-output`][ae6a] function. This kind of output redirects items into another outputs depending on their type. + +To separate output into `quotes.json` and `authors.json`, execute our scraper like this: + +``` +CL-USER> (scrapycl:start (make-instance 'quotes-spider) + :wait t + :output (scrapycl:typed-output + (list (cons 'quote-item + (scrapycl:json-lines #P"quotes.json")) + (cons 'author-item + (scrapycl:json-lines #P"authors.json"))))) + [19:29:43] cl-user (process quotes-spider quotes-page-request) - + Fetched BASE-URL: # + + [19:29:43] cl-user (process quotes-spider quotes-page-request) - + Fetched BASE-URL: # + + ... 
+ + [19:29:48] cl-user (process quotes-spider quotes-page-request) - + Fetched BASE-URL: # + +NIL +``` +It will save each type of item in a separate file. + +I hope this little introduction will urge you to try Scrapy`CL` for writing your own data scrapers! Feel free to share your ideas on the project's [discussions page][f9c2]. @@ -46,67 +839,87 @@ You can install this library from Quicklisp, but you want to receive updates qui -#### [package](caca) `scrapycl` +#### [package](b0b2) `scrapycl` #### Classes + + +##### FETCH-ERROR + + + +###### [condition](5010) `scrapycl:fetch-error` (scrapycl-error) + +This condition is signalled when [`scrapycl:fetch`][1128] generic-function gets non 200 status code. + ##### REQUEST -###### [class](7bc4) `scrapycl:request` () +###### [class](4dbf) `scrapycl:request` () **Readers** - + -###### [reader](8391) `scrapycl/request:request-url` (request) (:URL = (ERROR "Please, provide :URL argument.")) +###### [reader](9736) `scrapycl:request-url` (request) (:URL = (ERROR "Please, provide :URL argument.")) `URL` to fetch data from. + + +##### SCRAPYCL-ERROR + + + +###### [condition](3573) `scrapycl:scrapycl-error` (error) + +Base class for all Scrapy`CL` errors. + ##### SPIDER -###### [class](8ccb) `scrapycl:spider` () +###### [class](14ae) `scrapycl:spider` () **Readers** -###### [reader](6531) `scrapycl/spider::%initial-requests` (spider) (:initial-requests = nil) +###### [reader](3f4b) `scrapycl/spider::%initial-requests` (spider) (:initial-requests = nil) -###### [reader](8336) `scrapycl/spider::%spider-queue` (spider) (= nil) +###### [reader](f208) `scrapycl/spider::%spider-queue` (spider) (= nil) -###### [reader](af53) `scrapycl/spider::%spider-queue-lock` (spider) (= (MAKE-LOCK :NAME "Scrapycl Queue Lock")) +###### [reader](8b95) `scrapycl/spider::%spider-queue-lock` (spider) (= (MAKE-LOCK :NAME "Scrapycl Queue Lock")) -###### [reader](4003) `scrapycl/spider::%spider-thread` (spider) (= nil) +###### [reader](fd2b) `scrapycl/spider::%spider-thread` (spider) (= nil) **Accessors** -###### [accessor](8336) `scrapycl/spider::%spider-queue` (spider) (= nil) +###### [accessor](f208) `scrapycl/spider::%spider-queue` (spider) (= nil) -###### [accessor](af53) `scrapycl/spider::%spider-queue-lock` (spider) (= (MAKE-LOCK :NAME "Scrapycl Queue Lock")) +###### [accessor](8b95) `scrapycl/spider::%spider-queue-lock` (spider) (= (MAKE-LOCK :NAME "Scrapycl Queue Lock")) -###### [accessor](4003) `scrapycl/spider::%spider-thread` (spider) (= nil) +###### [accessor](fd2b) `scrapycl/spider::%spider-thread` (spider) (= nil) @@ -114,110 +927,98 @@ You can install this library from Quicklisp, but you want to receive updates qui -##### [generic-function](20d0) `scrapycl:fetch` spider request &key max-redirects timeout custom-headers +##### [generic-function](ce60) `scrapycl:fetch` spider request &key max-redirects timeout custom-headers - - -##### [generic-function](efc3) `scrapycl:process` spider object - - - -##### [generic-function](1d4c) `scrapycl:start` spider &key wait output &allow-other-keys - - - -#### Functions - - - -##### [function](e0f5) `scrapycl:enqueue` spider object &key (output-func nil scrapycl/engine::output-func-p) - - - -### SCRAPYCL/ERRORS - - +Fetches page from request's `URL`. -#### [package](fa3c) `scrapycl/errors` +Returns a multiple values: - +* A string with `HTML` response. +* `URL` from which response was received. Might be different from original `URL` because of redirects. +* A hash-table with reponse `HTTP` headers. 
-#### Classes - - - -##### FETCH-ERROR - - - -###### [condition](ece9) `scrapycl/errors:fetch-error` (scrapycl-error) + - +##### [generic-function](a378) `scrapycl:process` spider object -##### SCRAPYCL-ERROR + - +##### [generic-function](7c50) `scrapycl:start` spider &key wait output &allow-other-keys -###### [condition](eb68) `scrapycl/errors:scrapycl-error` (error) + - +##### [generic-function](97b7) `scrapycl:write-as-json` object stream -### SCRAPYCL/REQUEST + - +#### Functions -#### [package](27de) `scrapycl/request` + - +##### [function](d4e7) `scrapycl:enqueue` spider object &key (output-func nil scrapycl/engine::output-func-p) -#### Generics + - +##### [function](379a) `scrapycl:json-dict` FILENAME &KEY (KEY "items") -##### [generic-function] `scrapycl/request:request-url` object +Creates an "output" callback for serializing objects as a list inside a `JSON` dictionary. - + -### SCRAPYCL/UTILS +##### [function](bc23) `scrapycl:json-lines` filename &key (if-exists :supersede) - + -#### [package](5511) `scrapycl/utils` +##### [function](3f3a) `scrapycl:json-list` filename - + -#### Functions +##### [function](5c8a) `scrapycl:preview` nodes - + -##### [function](147a) `scrapycl/utils:preview` nodes +##### [function](0bd2) `scrapycl:typed-output` type-to-output-alist [174b]: https://40ants.com/scrapycl/ +[1128]: https://40ants.com/scrapycl/#x-28SCRAPYCL-3AFETCH-20GENERIC-FUNCTION-29 +[8baf]: https://40ants.com/scrapycl/#x-28SCRAPYCL-3AJSON-DICT-20FUNCTION-29 +[8b56]: https://40ants.com/scrapycl/#x-28SCRAPYCL-3AJSON-LINES-20FUNCTION-29 +[2cad]: https://40ants.com/scrapycl/#x-28SCRAPYCL-3AJSON-LIST-20FUNCTION-29 +[0482]: https://40ants.com/scrapycl/#x-28SCRAPYCL-3APROCESS-20GENERIC-FUNCTION-29 +[dcea]: https://40ants.com/scrapycl/#x-28SCRAPYCL-3ASPIDER-20CLASS-29 +[24dc]: https://40ants.com/scrapycl/#x-28SCRAPYCL-3ASTART-20GENERIC-FUNCTION-29 +[ae6a]: https://40ants.com/scrapycl/#x-28SCRAPYCL-3ATYPED-OUTPUT-20FUNCTION-29 +[b2a5]: https://docs.scrapy.org/en/latest/intro/tutorial.html [66a8]: https://github.com/40ants/scrapycl [5d38]: https://github.com/40ants/scrapycl/actions -[caca]: https://github.com/40ants/scrapycl/blob/49257e64a9f86ff94a98564e5a738fcd8f121723/src/core.lisp#L1 -[20d0]: https://github.com/40ants/scrapycl/blob/49257e64a9f86ff94a98564e5a738fcd8f121723/src/downloader.lisp#L16 -[e0f5]: https://github.com/40ants/scrapycl/blob/49257e64a9f86ff94a98564e5a738fcd8f121723/src/engine.lisp#L106 -[efc3]: https://github.com/40ants/scrapycl/blob/49257e64a9f86ff94a98564e5a738fcd8f121723/src/engine.lisp#L129 -[fa3c]: https://github.com/40ants/scrapycl/blob/49257e64a9f86ff94a98564e5a738fcd8f121723/src/errors.lisp#L1 -[eb68]: https://github.com/40ants/scrapycl/blob/49257e64a9f86ff94a98564e5a738fcd8f121723/src/errors.lisp#L10 -[ece9]: https://github.com/40ants/scrapycl/blob/49257e64a9f86ff94a98564e5a738fcd8f121723/src/errors.lisp#L14 -[27de]: https://github.com/40ants/scrapycl/blob/49257e64a9f86ff94a98564e5a738fcd8f121723/src/request.lisp#L1 -[7bc4]: https://github.com/40ants/scrapycl/blob/49257e64a9f86ff94a98564e5a738fcd8f121723/src/request.lisp#L10 -[8391]: https://github.com/40ants/scrapycl/blob/49257e64a9f86ff94a98564e5a738fcd8f121723/src/request.lisp#L11 -[8ccb]: https://github.com/40ants/scrapycl/blob/49257e64a9f86ff94a98564e5a738fcd8f121723/src/spider.lisp#L24 -[8336]: https://github.com/40ants/scrapycl/blob/49257e64a9f86ff94a98564e5a738fcd8f121723/src/spider.lisp#L25 -[af53]: https://github.com/40ants/scrapycl/blob/49257e64a9f86ff94a98564e5a738fcd8f121723/src/spider.lisp#L27 
-[4003]: https://github.com/40ants/scrapycl/blob/49257e64a9f86ff94a98564e5a738fcd8f121723/src/spider.lisp#L29 -[6531]: https://github.com/40ants/scrapycl/blob/49257e64a9f86ff94a98564e5a738fcd8f121723/src/spider.lisp#L31 -[1d4c]: https://github.com/40ants/scrapycl/blob/49257e64a9f86ff94a98564e5a738fcd8f121723/src/spider.lisp#L37 -[5511]: https://github.com/40ants/scrapycl/blob/49257e64a9f86ff94a98564e5a738fcd8f121723/src/utils.lisp#L1 -[147a]: https://github.com/40ants/scrapycl/blob/49257e64a9f86ff94a98564e5a738fcd8f121723/src/utils.lisp#L80 +[b0b2]: https://github.com/40ants/scrapycl/blob/58729e4f355e0efa3ce34264dea49736f42c50ea/src/core.lisp#L1 +[ce60]: https://github.com/40ants/scrapycl/blob/58729e4f355e0efa3ce34264dea49736f42c50ea/src/downloader.lisp#L16 +[d4e7]: https://github.com/40ants/scrapycl/blob/58729e4f355e0efa3ce34264dea49736f42c50ea/src/engine.lisp#L105 +[a378]: https://github.com/40ants/scrapycl/blob/58729e4f355e0efa3ce34264dea49736f42c50ea/src/engine.lisp#L128 +[3573]: https://github.com/40ants/scrapycl/blob/58729e4f355e0efa3ce34264dea49736f42c50ea/src/errors.lisp#L10 +[5010]: https://github.com/40ants/scrapycl/blob/58729e4f355e0efa3ce34264dea49736f42c50ea/src/errors.lisp#L15 +[bc23]: https://github.com/40ants/scrapycl/blob/58729e4f355e0efa3ce34264dea49736f42c50ea/src/output/json.lisp#L112 +[3f3a]: https://github.com/40ants/scrapycl/blob/58729e4f355e0efa3ce34264dea49736f42c50ea/src/output/json.lisp#L123 +[379a]: https://github.com/40ants/scrapycl/blob/58729e4f355e0efa3ce34264dea49736f42c50ea/src/output/json.lisp#L145 +[97b7]: https://github.com/40ants/scrapycl/blob/58729e4f355e0efa3ce34264dea49736f42c50ea/src/output/json.lisp#L19 +[0bd2]: https://github.com/40ants/scrapycl/blob/58729e4f355e0efa3ce34264dea49736f42c50ea/src/output/typed.lisp#L16 +[4dbf]: https://github.com/40ants/scrapycl/blob/58729e4f355e0efa3ce34264dea49736f42c50ea/src/request.lisp#L11 +[9736]: https://github.com/40ants/scrapycl/blob/58729e4f355e0efa3ce34264dea49736f42c50ea/src/request.lisp#L12 +[14ae]: https://github.com/40ants/scrapycl/blob/58729e4f355e0efa3ce34264dea49736f42c50ea/src/spider.lisp#L24 +[f208]: https://github.com/40ants/scrapycl/blob/58729e4f355e0efa3ce34264dea49736f42c50ea/src/spider.lisp#L25 +[8b95]: https://github.com/40ants/scrapycl/blob/58729e4f355e0efa3ce34264dea49736f42c50ea/src/spider.lisp#L27 +[fd2b]: https://github.com/40ants/scrapycl/blob/58729e4f355e0efa3ce34264dea49736f42c50ea/src/spider.lisp#L29 +[3f4b]: https://github.com/40ants/scrapycl/blob/58729e4f355e0efa3ce34264dea49736f42c50ea/src/spider.lisp#L31 +[7c50]: https://github.com/40ants/scrapycl/blob/58729e4f355e0efa3ce34264dea49736f42c50ea/src/spider.lisp#L37 +[5c8a]: https://github.com/40ants/scrapycl/blob/58729e4f355e0efa3ce34264dea49736f42c50ea/src/utils.lisp#L81 +[f9c2]: https://github.com/40ants/scrapycl/discussions [1e8e]: https://github.com/40ants/scrapycl/issues +[2490]: https://jsonlines.org/ [2c00]: https://quickdocs.org/40ants-doc [8236]: https://quickdocs.org/alexandria [3dbf]: https://quickdocs.org/bordeaux-threads +[61a4]: https://quickdocs.org/closer-mop [8347]: https://quickdocs.org/dexador [7f8b]: https://quickdocs.org/log4cl [691c]: https://quickdocs.org/log4cl-extras @@ -226,6 +1027,8 @@ You can install this library from Quicklisp, but you want to receive updates qui [c41d]: https://quickdocs.org/serapeum [8175]: https://quickdocs.org/spinneret [ef7f]: https://quickdocs.org/str +[aba2]: https://quickdocs.org/yason +[8cd8]: https://shinmera.github.io/lquery/ * * * ###### [generated by 
[40ANTS-DOC](https://40ants.com/doc/)] diff --git a/docs/changelog.lisp b/docs/changelog.lisp index 56ddd01..952a998 100644 --- a/docs/changelog.lisp +++ b/docs/changelog.lisp @@ -9,5 +9,5 @@ "ASDF" "REPL" "HTTP")) - (0.1.0 2023-02-05 + (0.1.0 2024-03-17 "* Initial version.")) diff --git a/docs/index.lisp b/docs/index.lisp index 349f903..882ee27 100644 --- a/docs/index.lisp +++ b/docs/index.lisp @@ -15,6 +15,8 @@ #:docs-config) (:import-from #:40ants-doc/autodoc #:defautodoc) + (:import-from #:scrapycl-docs/tutorial + #:@tutorial) (:export #:@index #:@readme #:@changelog)) @@ -34,13 +36,14 @@ (list :theme (find-symbol "40ANTS-THEME" - (find-package "40ANTS-DOC-THEME-40ANTS"))) - ) + (find-package "40ANTS-DOC-THEME-40ANTS")))) (defsection @index (:title "scrapycl - The web scraping framework for writing crawlers in Common Lisp." :ignore-words ("JSON" "HTTP" + "HTML" + "CL" "TODO" "Unlicense" "REPL" @@ -60,7 +63,7 @@ ![Quicklisp](http://quickdocs.org/badge/scrapycl.svg) " (@installation section) - (@usage section) + (@tutorial section) (@api section)) @@ -79,10 +82,4 @@ You can install this library from Quicklisp, but you want to receive updates qui """) -(defsection @usage (:title "Usage") - " -TODO: Write a library description. Put some examples here. -") - - (defautodoc @api (:system "scrapycl")) diff --git a/docs/tutorial.lisp b/docs/tutorial.lisp new file mode 100644 index 0000000..3aebc0a --- /dev/null +++ b/docs/tutorial.lisp @@ -0,0 +1,892 @@ +(uiop:define-package #:scrapycl-docs/tutorial + (:use #:cl) + (:import-from #:40ants-doc + #:defsection) + (:import-from #:pythonic-string-reader + #:pythonic-string-syntax) + (:import-from #:named-readtables + #:in-readtable)) +(in-package #:scrapycl-docs/tutorial) + +(in-readtable pythonic-string-syntax) + + +(defsection @tutorial (:title "Tutorial" + :ignore-words ("CSS" + "CLOS" + "SCRAPYCL:STOP-OUTPUT" + "NEXT" + "AUTHOR-PAGE-REQUEST" + "DSL" + "AUTHOR-ITEM" + "QUOTE-ITEM" + "MERGE-WITH-URL")) + (@intro section) + (@our-first-scraper section) + (@extracting-data section) + (@extracting-quotes-and-authors section) + (@storing-data section) + (@following-links section)) + + +(defsection @intro (:title "Introduction") + """ +In this tutorial we'll train our parsing skill on this toy site: https://quotes.toscrape.com/. +We will follow [Scrapy's tutorial](https://docs.scrapy.org/en/latest/intro/tutorial.html) and see if we can get all the data using ScrapyCL. + +You will find whole code for this tutorial in the `tutorial/` folder. + + +Firstly Scrapy tutorial shows us how to experiment with HTTP response in the REPL. But with Common Lisp we have much more sofisticated REPL out of the box. So we skip this step: + +``` +scrapy shell "https://quotes.toscrape.com/page/1/" +``` + +Right to the Common Lisp REPL! +""") + + +(defsection @our-first-scraper (:title "Our First Scraper") + """ +ScrapyCL is built around CLOS. Every scraping pipeline in this framework operates on CLOS objects. +Most generic-functions accept a SCRAPYCL:SPIDER object as a first argument. Also, requests to HTML pages are typed. +This way you are telling to the framework how each page should be processed. 
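To make that idea concrete before we build the real thing, here is a minimal hypothetical sketch: one request class per kind of page, and a SCRAPYCL:PROCESS method specialized on each of them. The names `my-spider`, `listing-page-request`, `details-page-request` and the example URL are placeholders, not part of the tutorial code.

```lisp
;; Hypothetical sketch: two page types, one spider, two specialized methods.
(defclass listing-page-request (scrapycl:request) ())
(defclass details-page-request (scrapycl:request) ())
(defclass my-spider (scrapycl:spider) ())

(defmethod scrapycl:process ((spider my-spider) (request listing-page-request))
  ;; Parse the listing here and return new requests so the engine follows them.
  (list (make-instance 'details-page-request
                       :url "https://example.com/details/1")))

(defmethod scrapycl:process ((spider my-spider) (request details-page-request))
  ;; Parse a details page here; returning no values stops this branch.
  (values))
```

The rest of this section builds the same structure step by step for the quotes site.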
+ +First thing we need to do is to define a class of the request to a page with quotes: + +```lisp +CL-USER> (defclass quotes-page-request (scrapycl:request) + ()) +# + +``` + +Next, we define a class for the spider: + +```lisp +CL-USER> (defclass quotes-spider (scrapycl:spider) + () + (:default-initargs + :initial-requests (list (make-instance 'quotes-page-request + :url "https://quotes.toscrape.com/page/1/") + (make-instance 'quotes-page-request + :url "https://quotes.toscrape.com/page/2/")))) +``` + +Here we tell the spider to start from two initial pages. + +Now it is time to make our first HTTP request and to see content of the page. +I'll save the page's content to a variable to be able to play with parsing. + + +```lisp +CL-USER> (defparameter *response* + (scrapycl:fetch (make-instance 'quotes-spider) + (make-instance 'quotes-page-request + :url "https://quotes.toscrape.com/page/1/"))) +*RESPONSE* +CL-USER> *response* +" + + + + Quotes to Scrape + + (scrapycl:start (make-instance 'quotes-spider) :wait t) +(# + #) +``` + +It returns initial page requests as is because we didn't write a method for SCRAPYCL:PROCESS generic-function. Now we'll define it to save content into the files: + +``` +CL-USER> (defmethod scrapycl:process ((spider quotes-spider) + (request quotes-page-request)) + (multiple-value-bind (data url) + (scrapycl:fetch spider request) + (let* ((page-number (third (str:split "/" (quri:uri-path url)))) + (filename (format nil "quotes-~A.html" page-number))) + (alexandria:write-string-into-file data filename + :if-exists :supersede) + (log:info "Page saved to" filename) + ;; return nothing, to stop processing + (values)))) +# +``` + +Next attempt to start the scraper will output information that data was saved to the files: + +``` +CL-USER> (scrapycl:start (make-instance 'quotes-spider) :wait t) + [18:30:37] cl-user (process quotes-spider quotes-page-request) - + Page saved to FILENAME: "quotes-2.html" + [18:30:37] cl-user (process quotes-spider quotes-page-request) - + Page saved to FILENAME: "quotes-1.html" +NIL +``` + +Now it is time to extract useful information out from these HTML pages. +""" + ) + + +(defsection @extracting-data (:title "Extracting the Data") + """ + Where Scrapy shows iPython REPL: + +``` +>>> response.css("title::text").getall() +['Quotes to Scrape'] +``` + +We have a full-featured Common Lisp REPL. For HTML parsing we'll use great [lQuery library](https://shinmera.github.io/lquery/). Here is how we can reproduce python code in Lisp using lquery DSL: + + +``` +CL-USER> (lquery:$ + (initialize *response*) + "title" + (text)) +#("Quotes to Scrape") +``` + +lQuery is parses data in a functional way. It has [amazing documentation](https://shinmera.github.io/lquery/). Take a moment and read it to understand the basic principles. + +Then Python tutorial shows us what `getall` method returns: + +``` +response.css("title").getall() +['Quotes to Scrape'] +``` + +With lisp we could do the same. Just drop `(text)` form at the end of the lquery pipeline: + +``` +CL-USER> (lquery:$ + (initialize *response*) + "title") +#(#) +``` + +But this code returns us a set of HTML nodes. 
If you want to see the actual HTML code behind it, +use `(serialize)` form: + +``` +CL-USER> (lquery:$ + (initialize *response*) + "title" + (serialize)) +#("Quotes to Scrape") +``` + +To get only a single item in Python you use `get` method instead of `getall`: + +``` +>>> response.css("title::text").get() +'Quotes to Scrape' +``` + +In Lisp use `lquery:$1` macro instead of `lquery:$`: + +``` +CL-USER> (lquery:$1 + (initialize *response*) + "title" + (text)) +"Quotes to Scrape" +``` + +You see, it returns a single object instead of an array! + +As an alternative, you could’ve written in Python: + +``` +>>> response.css("title::text")[0].get() +'Quotes to Scrape' +``` + +In Lisp we can do the same: + +``` +CL-USER> (let* ((nodes (lquery:$ + (initialize *response*) + "title")) + (title-node (elt nodes 0))) + (plump:text title-node)) +"Quotes to Scrape" +``` + +There is no analogue to `re` from Scrapy in lQuery: + +``` +>>> response.css("title::text").re(r"Quotes.*") +['Quotes to Scrape'] +``` + +but you can use a filter function: + +``` +CL-USER> (lquery:$ + (initialize *response*) + "title" + (text) + (filter (lambda (text) + (cl-ppcre:scan "Quotes.*" text)))) +#("Quotes to Scrape") +``` + +Another example from Python code: + +``` +>>> response.css("title::text").re(r"Q\w+") +['Quotes'] +``` + +Becomes in Lisp: + +``` +CL-USER> (lquery:$ + (initialize *response*) + "title" + (text) + (each (lambda (text) + (cl-ppcre:scan-to-strings "Q\\w+" text)) + :replace t)) +#("Quotes") +``` + + +And this Python code: + +``` +>>> response.css("title::text").re(r"(\w+) to (\w+)") +['Quotes', 'Scrape'] +``` + +becomes: + + +``` +CL-USER> (lquery:$1 + (initialize *response*) + "title" + (text) + (map (lambda (text) + (nth-value 1 + (cl-ppcre:scan-to-strings "(\\w+) to (\\w+)" + text))))) +#("Quotes" "Scrape") +``` + """) + + +(defsection @extracting-quotes-and-authors (:title "Extracting Quotes and Authors (step2.lisp)") + """ + We already have first page's content in the `*response*` variable. Now let's extract quotes! + +Instead of this Python code: + +``` +response.css("div.quote") +[, +, +...] +``` + +We can do this in Lisp: + +``` +CL-USER> (lquery:$ + (initialize *response*) + "div.quote") +#(# # + # # + # # + # #) +``` + +Here is how we can to limit the number of items to not clutter the REPL. lQuery provides a `(function ...)` form where you can call any function you like. We'll use it to apply Serapeum's `take` function to cut only two first elements from the array of nodes: + +``` +CL-USER> (lquery:$ + (initialize *response*) + "div.quote" + (function + (lambda (nodes) + (serapeum:take 2 nodes)))) +#(# #) +``` + +Now it is easy to add `(serialize)` form and preview the extracted pieces: + +``` +CL-USER> (lquery:$ + (initialize *response*) + "div.quote" + (function + (lambda (nodes) + (serapeum:take 2 nodes))) + (serialize)) +#("
+ “The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.” + by Albert Einstein + (about) + +
+ Tags: + > + + change + + deep-thoughts + + thinking + + world + +
+
" + "
+ “It is our choices, Harry, that show what we truly are, far more than our abilities.” + by J.K. Rowling + (about) + +
+ Tags: + > + + abilities + + choices + +
+
") +``` + +Now let's extract the first quote. Instead of this code in Python which sequentilly extracts quote node and then it's subelements: + +``` +>>> text = quote.css("span.text::text").get() +>>> text +'“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”' +>>> author = quote.css("small.author::text").get() +>>> author +'Albert Einstein' +``` + +We will use the power of functional approach and extract all needed data in a single pipeline using lQuery's form `(combine ...)`: + +``` +CL-USER> (lquery:$ + (initialize *response*) + "div.quote" + (function + (lambda (nodes) + (serapeum:take 2 nodes))) + (combine + (lquery:$1 + "span.text" + (text)) + (lquery:$1 + "small.author" + (text)))) +#(("“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”" + "Albert Einstein") + ("“It is our choices, Harry, that show what we truly are, far more than our abilities.”" + "J.K. Rowling")) +``` + +Note, this code we put after the `serapeum:take 2`: + +``` +(combine + (lquery:$1 + "span.text" + (text)) + (lquery:$1 + "small.author" + (text)))) +``` + +It allows us to extract two subelements of the `div.quote` node simultaneously, using function `combine`. These two pieces are combined into an array like: + +``` +#("“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”" + "Albert Einstein") +``` + +But because of functional nature of lQuery, this `combine` operation is applied to all `div.quote` nodes on the page and we don't have to write explicit iteration loop. + +After that, original Scrapy's tutorial shows us how to extract tags list for each quote: + +``` +>>> tags = quote.css("div.tags a.tag::text").getall() +>>> tags +['change', 'deep-thoughts', 'thinking', 'world'] +``` + +but knowing how does `combine` work, we can just add another rule into the `combine` form: + +``` +CL-USER> (lquery:$ + (initialize *response*) + "div.quote" + (function + (lambda (nodes) + (serapeum:take 2 nodes))) + (combine + (lquery:$1 + "span.text" + (text)) + (lquery:$1 + "small.author" + (text)) + (lquery:$ + "div.tags a.tag" + (text)))) +#(("“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”" + "Albert Einstein" #("change" "deep-thoughts" "thinking" "world")) + ("“It is our choices, Harry, that show what we truly are, far more than our abilities.”" + "J.K. Rowling" #("abilities" "choices"))) +``` + +Note, for tags we are using `lquery:$` because there is a list of them. + +Scrapy's tutorial creates a hash table for each quote, but ScrapyCL framework's pipeline operates on CLOS objects. 
So, we'll create a separate `QUOTE-ITEM` class: + +``` +CL-USER> (defclass quote-item () + ((text :initarg :text + :type string + :reader quote-text) + (author :initarg :author + :type string + :reader quote-author) + (tags :initarg :tags + :type (serapeum:soft-list-of string) + :reader quote-tags))) +# + +CL-USER> (defmethod print-object ((obj quote-item) stream) + (print-unreadable-object (obj stream :type t) + (format stream "~A by ~A ~{#~A~^, ~}" + (quote-text obj) + (quote-author obj) + (quote-tags obj)))) +``` + +Now we'll use `map-apply` to transform parsed data into these CLOS objects: + +``` +CL-USER> (lquery:$ + (initialize *response*) + "div.quote" + (function + (lambda (nodes) + (serapeum:take 2 nodes))) + (combine + (lquery:$1 + "span.text" + (text)) + (lquery:$1 + "small.author" + (text)) + (lquery:$ + "div.tags a.tag" + (text))) + (map-apply + (lambda (text author tags) + (make-instance 'quote-item + :text text + :author author + :tags (coerce tags 'list))))) +#(# + #) +``` + +Put this piece of code into our method for SCRAPYCL:PROCESS generic-function: + +``` +CL-USER> (defmethod scrapycl:process ((spider quotes-spider) + (request quotes-page-request)) + (let ((data (scrapycl:fetch spider request))) + (lquery:$ + (initialize data) + "div.quote" + (combine + (lquery:$1 + "span.text" + (text)) + (lquery:$1 + "small.author" + (text)) + (lquery:$ + "div.tags a.tag" + (text))) + (map-apply + (lambda (text author tags) + (make-instance 'quote-item + :text text + :author author + :tags (coerce tags 'list))))))) +``` + +And don't forget to remove this piece of code limiting the number of processed quotes: + +``` +(function + (lambda (nodes) + (serapeum:take 2 nodes))) +``` + +we needed it only for a debug purpose. + +Now start our scraper: + +``` +CL-USER> (scrapycl:start (make-instance 'quotes-spider) :wait t) +(# + # +... +``` + +As you can see, by default it returns a list of all items on the page, but in real world you will want this data to be saved or processed. In the next part we'll see how to do this. + +""" + ) + + +(defsection @storing-data (:title "Storing the Scraped Data") + """ +Scrapy's tutorial shows this command as example on how to save scraped data to a json file: + +``` +scrapy crawl quotes -O quotes.json +``` + +With ScrapyCL we can do the similar but with OUTPUT argument to the SCRAPYCL:START generic-function and SCRAPYCL:JSON-LINES function: + +``` +CL-USER> (scrapycl:start (make-instance 'quotes-spider) + :wait t + :output (scrapycl:json-lines #P"items.json")) +``` + +And content of `items.json` file will look like: + +```json +{"text":"“A day without sunshine is like, you know, night.”","author":"Steve Martin","tags":["humor","obvious","simile"]} +{"text":"“A woman is like a tea bag; you never know how strong it is until it's in hot water.”","author":"Eleanor Roosevelt","tags":["misattributed-eleanor-roosevelt"]} +``` + +Each object is on it's own line in a [JsonLines](https://jsonlines.org/) format. If you want to get a JSON file with a list, then use `SCRAPYCL:JSON-LIST` instead. Or use `SCRAPYCL:JSON-DICT` to get a file with JSON object. But beware, these two outputs can't work in :APPEND mode. + + +### Custom processing + +Actually, OUTPUT argument accepts any lisp function. The only requirements are: + +- This function should accept a single argument. The objects for which there is no specialized method of SCRAPYCL:PROCESS generic-function will be passed into this function. 
+- It should accept SCRAPYCL:STOP-OUTPUT symbol and flush buffer, closing a file or a transaction, because this symbol is sent when all hrefs were processed and there is no more data to process. + +""" + ) + + +(defsection @following-links (:title "Following the Links") + """ +In Scrapy framework for following links you should yield a request object like this: + +``` +yield scrapy.Request(next_page, callback=self.parse) +``` + +With ScrapyCL to follow links, you need to return new request objects from your method for SCRAPYCL:PROCESS generic-function. And because you are returning an object of a customized request class, you can add more data slots to the request to make this additional data available during request processing. For example, such data might include a parent category of the page or an some piece of data available on other page. + +Scrapy's test site contains quotes and their authors. Let's make our scraper parse not only quotes but also their authors. See `tutorial/step3.lisp` file for the full code for this part. + +First, we need to add an AUTHOR-ITEM class: + +``` +CL-USER> (defclass author-item () + ((name :initarg :name + :type string + :reader author-name) + (birthday :initarg :birthday + :type string + :reader author-birthday) + (bio :initarg :bio + :type string + :reader author-bio))) +# + +CL-USER> (defmethod print-object ((obj author-item) stream) + (print-unreadable-object (obj stream :type t) + (format stream "~A" + (author-name obj)))) +# +``` + +Now let's make our spider do follow links leading to the next page and to the authors pages. + +Here is how we can extract the link to the next page: + +``` +CL-USER> (lquery:$1 + (initialize *response*) + "ul.pager a" + (attr "href")) +"/page/2/" +``` + +But we need an absolute URL for request. So we have to merge this path with a base URL. + +SCRAPYCL:FETCH generic-function returns current page's real URL as a second value. Also, ScrapyCL provides a MERGE-WITH-URL lquery form. Together they can be used like this: + +``` +CL-USER> (multiple-value-bind (response base-url) + (scrapycl:fetch (make-instance 'quotes-spider) + (make-instance 'quotes-page-request + :url "https://quotes.toscrape.com/")) + (lquery:$1 + (initialize response) + "ul.pager a" + (attr "href") + (merge-url-with base-url))) +"https://quotes.toscrape.com/page/2/" +``` + +It is better to use URL returned by SCRAPYCL:FETCH generic-function because of these two reasons: + +- This URL can differ from original request URL because site might redirect request to the other page. +- Href attributes on the page can be relative, like `../quotes/page/1` and will not work if you'll hardcode base url. + +Let's figure out which author pages should be followed. Original Scrapy tutorial uses this CSS selector `.author + a`, but lquery does not support `+` selector. 
To find these sibling links we can instead use the `NEXT` form, which selects the element immediately following each `.author` node:
+
+```
+CL-USER> (multiple-value-bind (response base-url)
+             (scrapycl:fetch (make-instance 'quotes-spider)
+                             (make-instance 'quotes-page-request
+                                            :url "https://quotes.toscrape.com/"))
+           (lquery:$
+             (initialize response)
+             ".author"
+             (next "a")
+             (attr "href")
+             (merge-url-with base-url)))
+#("https://quotes.toscrape.com/author/Albert-Einstein"
+  "https://quotes.toscrape.com/author/J-K-Rowling"
+  "https://quotes.toscrape.com/author/Albert-Einstein"
+  "https://quotes.toscrape.com/author/Jane-Austen"
+  "https://quotes.toscrape.com/author/Marilyn-Monroe"
+  "https://quotes.toscrape.com/author/Albert-Einstein"
+  "https://quotes.toscrape.com/author/Andre-Gide"
+  "https://quotes.toscrape.com/author/Thomas-A-Edison"
+  "https://quotes.toscrape.com/author/Eleanor-Roosevelt"
+  "https://quotes.toscrape.com/author/Steve-Martin")
+```
+
+OK, now that we have URLs to follow, let's modify our processing function to return them as new requests:
+
+```
+CL-USER> (defclass author-page-request (scrapycl:request)
+           ())
+#
+
+
+CL-USER> (defmethod scrapycl:process ((spider quotes-spider)
+                                      (request quotes-page-request))
+           (multiple-value-bind (data base-url)
+               (scrapycl:fetch spider request)
+             (log:info "Fetched" base-url)
+
+             (let ((quotes (lquery:$
+                             (initialize data)
+                             "div.quote"
+                             (combine
+                              (lquery:$1
+                               "span.text"
+                               (text))
+                              (lquery:$1
+                               "small.author"
+                               (text))
+                              (lquery:$
+                               "div.tags a.tag"
+                               (text)))
+                             (map-apply
+                              (lambda (text author tags)
+                                (make-instance 'quote-item
+                                               :text text
+                                               :author author
+                                               :tags (coerce tags 'list))))))
+                   (next-page-url (lquery:$1
+                                   (initialize data)
+                                   "ul.pager a"
+                                   (attr "href")
+                                   (merge-url-with base-url)))
+                   (author-urls (lquery:$
+                                 (initialize data)
+                                 ".author"
+                                 (next "a")
+                                 (attr "href")
+                                 (merge-url-with base-url))))
+               ;; Now return objects and new requests
+               (list quotes
+                     (map 'list (lambda (url)
+                                  (make-instance 'author-page-request
+                                                 :url url))
+                          author-urls)
+                     (when next-page-url
+                       (make-instance 'quotes-page-request
+                                      :url next-page-url))))))
+#
+```
+
+We return objects of three types from this processing method: quote-items, quotes-page-requests, and author-page-requests.
+
+Now if we run our scraper, we'll see that it only walks through the quotes pages and ignores the author pages:
+
+```
+CL-USER> (scrapycl:start (make-instance 'quotes-spider)
+                         :wait t
+                         :output (scrapycl:json-lines #P"items.json"))
+  [19:25:21] cl-user (process quotes-spider quotes-page-request) -
+    Fetched BASE-URL: #
+
+  [19:25:21] cl-user (process quotes-spider quotes-page-request) -
+    Fetched BASE-URL: #
+
+NIL
+
+```
+
+But in the `items.json` file you might see some interesting records:
+
+```
+{"url":"https://quotes.toscrape.com/author/Steve-Martin"}
+{"url":"https://quotes.toscrape.com/author/Eleanor-Roosevelt"}
+{"url":"https://quotes.toscrape.com/author/Thomas-A-Edison"}
+{"url":"https://quotes.toscrape.com/author/Andre-Gide"}
+...
+```
+
+This is because we forgot to define a processing method for our AUTHOR-PAGE-REQUEST class. ScrapyCL sees objects for which there is no processing method and decides that these are final objects to be serialized to the output. Let's write a method to extract information about authors as well. 
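+
+Before writing it, we can quickly confirm that no such method exists yet. This is just an optional sanity check using standard CLOS introspection (FIND-METHOD), nothing ScrapyCL-specific; NIL here means no method is specialized on AUTHOR-PAGE-REQUEST, which is why those requests end up in the output as plain items:
+
+```
+CL-USER> (find-method #'scrapycl:process
+                      '()
+                      (list (find-class 'quotes-spider)
+                            (find-class 'author-page-request))
+                      nil)
+NIL
+```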
+
+Here I've just translated these Python rules:
+
+
+```
+def parse_author(self, response):
+    def extract_with_css(query):
+        return response.css(query).get(default="").strip()
+
+    yield {
+        "name": extract_with_css("h3.author-title::text"),
+        "birthdate": extract_with_css(".author-born-date::text"),
+        "bio": extract_with_css(".author-description::text"),
+    }
+```
+
+into the lquery DSL:
+
+```
+CL-USER> (multiple-value-bind (response)
+             (scrapycl:fetch (make-instance 'quotes-spider)
+                             (make-instance 'author-page-request
+                                            :url "https://quotes.toscrape.com/author/Thomas-A-Edison"))
+           (lquery:$1
+             (initialize response)
+             (combine
+              (lquery:$1
+               "h3.author-title"
+               (text))
+              (lquery:$1
+               ".author-born-date"
+               (text))
+              (lquery:$1
+               ".author-description"
+               (text)
+               (map #'str:trim)))))
+
+("Thomas A. Edison" "February 11, 1847"
+ "Thomas Alva Edison was an American inventor, scientist and businessman who developed many devices that greatly influenced life around the world, including the phonograph, the motion picture camera, and a long-lasting, practical electric light bulb. Dubbed \"The Wizard of Menlo Park\" (now Edison, New Jersey) by a newspaper reporter, he was one of the first inventors to apply the principles of mass production and large teamwork to the process of invention, and therefore is often credited with the creation of the first industrial research laboratory.Edison is considered one of the most prolific inventors in history, holding 1,093 U.S. patents in his name, as well as many patents in the United Kingdom, France and Germany. He is credited with numerous inventions that contributed to mass communication and, in particular, telecommunications. His advanced work in these fields was an outgrowth of his early career as a telegraph operator. Edison originated the concept and implementation of electric-power generation and distribution to homes, businesses, and factories – a crucial development in the modern industrialized world. His first power station was on Manhattan Island, New York.")
+```
+
+And here is the full processing method, which will return an author object:
+
+
+```
+CL-USER> (defmethod scrapycl:process ((spider quotes-spider)
+                                      (request author-page-request))
+           (multiple-value-bind (data base-url)
+               (scrapycl:fetch spider request)
+             (log:info "Fetched" base-url)
+
+             (lquery:$1
+               (initialize data)
+               (combine
+                (lquery:$1
+                 "h3.author-title"
+                 (text))
+                (lquery:$1
+                 ".author-born-date"
+                 (text))
+                (lquery:$1
+                 ".author-description"
+                 (text)
+                 (map #'str:trim)))
+               (map-apply
+                (lambda (name birthday bio)
+                  (make-instance 'author-item
+                                 :name name
+                                 :birthday birthday
+                                 :bio bio))))))
+
+#
+```
+
+
+Now, if you start our spider again, you'll get quotes and authors mixed in the same `items.json` file.
+
+But how do we put different kinds of objects into different output files?
+
+This is easy: just use the SCRAPYCL:TYPED-OUTPUT function. This kind of output redirects items to other outputs depending on their type.
+
+To separate the output into `quotes.json` and `authors.json`, run our scraper like this:
+
+```
+CL-USER> (scrapycl:start (make-instance 'quotes-spider)
+                         :wait t
+                         :output (scrapycl:typed-output
+                                  (list (cons 'quote-item
+                                              (scrapycl:json-lines #P"quotes.json"))
+                                        (cons 'author-item
+                                              (scrapycl:json-lines #P"authors.json")))))
+  [19:29:43] cl-user (process quotes-spider quotes-page-request) -
+    Fetched BASE-URL: #
+
+  [19:29:43] cl-user (process quotes-spider quotes-page-request) -
+    Fetched BASE-URL: #
+
+  ...
+ + [19:29:48] cl-user (process quotes-spider quotes-page-request) - + Fetched BASE-URL: # + +NIL +``` + +It will save each type of item in a separate file. + +I hope this little introduction will urge you to try ScrapyCL for writing your own data scrapers! Feel free to share your ideas on the project's [discussions page](https://github.com/40ants/scrapycl/discussions). +""" + ) diff --git a/scrapycl.asd b/scrapycl.asd index ee73055..512d176 100644 --- a/scrapycl.asd +++ b/scrapycl.asd @@ -13,7 +13,9 @@ "scrapycl/request" "scrapycl/spider" "scrapycl/downloader" - "scrapycl/utils") + "scrapycl/utils" + "scrapycl/output/json" + "scrapycl/output/typed") :in-order-to ((test-op (test-op "scrapycl-tests")))) diff --git a/src/core.lisp b/src/core.lisp index aafcfea..17e2838 100644 --- a/src/core.lisp +++ b/src/core.lisp @@ -1,15 +1,28 @@ (uiop:define-package #:scrapycl (:use #:cl) + (:import-from #:40ants-doc/ignored-words + #:ignore-words-in-package) (:nicknames #:scrapycl/core) (:export #:spider #:start - #:url #:process #:enqueue #:request - #:fetch)) + #:fetch) + (:export #:url) + (:export #:stop-output) + (:export #:fetch-error + #:scrapycl-error) + (:export #:request-url + #:request) + (:export #:typed-output) + (:export #:json-lines + #:json-list + #:json-dict + #:write-as-json) + (:export #:preview)) (in-package #:scrapycl) -(deftype url () - 'string) +(ignore-words-in-package 'stop-output + 'url) diff --git a/src/downloader.lisp b/src/downloader.lisp index 70b6330..ce587ca 100644 --- a/src/downloader.lisp +++ b/src/downloader.lisp @@ -14,6 +14,13 @@ (defgeneric fetch (spider request &key max-redirects timeout custom-headers) + (:documentation "Fetches page from request's URL. + + Returns a multiple values: + + - A string with HTML response. + - URL from which response was received. Might be different from original URL because of redirects. + - A hash-table with reponse HTTP headers.") (:method ((spider spider) (request request) &key max-redirects timeout (custom-headers '(("User-Agent" . 
"ScrapyCL (https://40ants.com/scrapycl/)")))) @@ -31,4 +38,5 @@ :body body :headers headers)) (values body - last-uri))))) + last-uri + headers))))) diff --git a/src/engine.lisp b/src/engine.lisp index 3ff8e10..86fc3b3 100644 --- a/src/engine.lisp +++ b/src/engine.lisp @@ -16,13 +16,12 @@ (:import-from #:scrapycl/task) (:import-from #:log4cl-extras/error #:with-log-unhandled) - (:import-from #:scrapycl/output + (:import-from #:scrapycl/core #:stop-output) (:import-from #:serapeum #:->) (:import-from #:scrapycl/request - #:request-url) - (:export #:clear-queue)) + #:request-url)) (in-package #:scrapycl/engine) diff --git a/src/errors.lisp b/src/errors.lisp index 14b5eff..60eef08 100644 --- a/src/errors.lisp +++ b/src/errors.lisp @@ -1,14 +1,15 @@ (uiop:define-package #:scrapycl/errors (:use #:cl) - (:import-from #:quri) - (:export - #:fetch-error - #:scrapycl-error)) + (:import-from #:scrapycl/core + #:fetch-error + #:scrapycl-error) + (:import-from #:quri)) (in-package #:scrapycl/errors) (define-condition scrapycl-error (error) - ()) + () + (:documentation "Base class for all ScrapyCL errors.")) (define-condition fetch-error (scrapycl-error) @@ -16,6 +17,7 @@ (status :initarg :status) (body :initarg :body) (headers :initarg :headers)) + (:documentation "This condition is signalled when SCRAPYCL:FETCH generic-function gets non 200 status code.") (:report (lambda (condition stream) (with-slots (url status) condition diff --git a/src/output.lisp b/src/output.lisp deleted file mode 100644 index 777b376..0000000 --- a/src/output.lisp +++ /dev/null @@ -1,5 +0,0 @@ -(uiop:define-package #:scrapycl/output - (:use #:cl) - (:export #:stop-output)) -(in-package #:scrapycl/output) - diff --git a/src/output/json.lisp b/src/output/json.lisp index 56a92d1..eb3d712 100644 --- a/src/output/json.lisp +++ b/src/output/json.lisp @@ -7,11 +7,12 @@ (:import-from #:bt2) (:import-from #:closer-mop #:class-slots) - (:export #:json-file - #:write-as-json - #:json-lines - #:json-list - #:json-dict)) + (:import-from #:scrapycl/core + #:stop-output + #:write-as-json + #:json-lines + #:json-list + #:json-dict)) (in-package #:scrapycl/output/json) @@ -35,58 +36,25 @@ (values)))) - - -;; (defun json-lines (filename &key (if-exists :append)) -;; (let ((stream nil) -;; (stream-closed nil) -;; (lock (bt2:make-lock :name (fmt "Lock on ~A" filename)))) -;; (labels ((ensure-stream-is-opened () -;; (unless stream -;; (setf stream -;; (open filename -;; :direction :output -;; :if-does-not-exist :create -;; :if-exists if-exists)))) - -;; (close-stream () -;; (when stream -;; (close stream) -;; (setf stream nil) -;; (setf stream-closed -;; t))) - -;; (serialize (object) -;; (bt2:with-lock-held (lock) -;; (when stream-closed -;; (error "Stream to ~A was closed. Create a new JSON-FILE output." 
-;; filename)) - -;; (ensure-stream-is-opened) -;; (cond -;; ((eql object 'scrapycl/output:stop-output) -;; (close-stream)) -;; (t -;; (write-as-json object stream) -;; (terpri stream)))))) -;; (values #'serialize)))) - - (-> make-json-output ((or string pathname) &key (:if-exists (or null keyword)) (:after-stream-opened (or null function)) (:before-stream-closed (or null function)) + (:before-first-object (or null function)) (:before-each-object (or null function))) (values function &optional)) + (defun make-json-output (filename &key (if-exists :supersede) (after-stream-opened nil) (before-stream-closed nil) + (before-first-object nil before-first-object-given-p) (before-each-object nil)) "Internal function to reuse logic of stream opening and closing." (let ((stream nil) (stream-closed nil) + (first-object t) (lock (bt2:make-lock :name (fmt "Lock on ~A" filename)))) (labels ((ensure-stream-is-opened () (unless stream @@ -110,16 +78,28 @@ (serialize (object) (bt2:with-lock-held (lock) (when stream-closed - (error "Stream to ~A was closed. Create a new JSON-FILE output." + (error "Stream to ~A was closed. Create a new JSON output." filename)) (ensure-stream-is-opened) (cond - ((eql object 'scrapycl/output:stop-output) + ((eql object 'stop-output) (close-stream)) (t - (when before-each-object - (funcall before-each-object stream)) + (let ((hook + (cond + (first-object + (setf first-object nil) + ;; Check for before-first-object-given-p + ;; allows us to pass NIL in before-first-object + ;; argument to suppress any action for first item: + (if before-first-object-given-p + before-first-object + before-each-object)) + (t + before-each-object)))) + (when hook + (funcall hook stream))) (write-as-json object stream)))))) (values #'serialize)))) @@ -132,6 +112,7 @@ (defun json-lines (filename &key (if-exists :supersede)) (make-json-output filename :if-exists if-exists + :before-first-object nil :before-each-object #'terpri)) @@ -162,6 +143,7 @@ (values function &optional)) (defun json-dict (filename &key (key "items")) + "Creates an \"output\" callback for serializing objects as a list inside a JSON dictionary." 
(let ((first-item t)) (flet ((write-header (stream) (write-string "{\"" stream) diff --git a/src/output/typed.lisp b/src/output/typed.lisp index e1784bc..dbb4329 100644 --- a/src/output/typed.lisp +++ b/src/output/typed.lisp @@ -2,7 +2,9 @@ (:use #:cl) (:import-from #:serapeum #:->) - (:export #:typed-output)) + (:import-from #:scrapycl/core + #:stop-output + #:typed-output)) (in-package #:scrapycl/output/typed) @@ -14,7 +16,7 @@ (defun typed-output (type-to-output-alist) (flet ((process (object) (cond - ((eql object 'scrapycl/output:stop-output) + ((eql object 'stop-output) ;; This kind of object should be translated to all outputs (loop for item in type-to-output-alist for output = (cdr item) diff --git a/src/request.lisp b/src/request.lisp index 5b14e5d..51bc451 100644 --- a/src/request.lisp +++ b/src/request.lisp @@ -1,9 +1,10 @@ (uiop:define-package #:scrapycl/request (:use #:cl) (:import-from #:scrapycl/core - #:url - #:request) - (:export #:request-url)) + #:request + #:request-url) + (:import-from #:scrapycl/types + #:url)) (in-package #:scrapycl/request) diff --git a/src/types.lisp b/src/types.lisp new file mode 100644 index 0000000..f798f4a --- /dev/null +++ b/src/types.lisp @@ -0,0 +1,12 @@ +(uiop:define-package #:scrapycl/types + (:use #:cl) + (:import-from #:scrapycl/core + #:url) + (:import-from #:40ants-doc/ignored-words + #:ignore-words-in-package)) +(in-package #:scrapycl/types) + + +(deftype url () + "Represents a URL." + 'string) diff --git a/src/utils.lisp b/src/utils.lisp index 44d2cb1..5aff192 100644 --- a/src/utils.lisp +++ b/src/utils.lisp @@ -7,7 +7,8 @@ (:import-from #:quri) (:import-from #:serapeum #:fmt) - (:export #:preview)) + (:import-from #:scrapycl/core + #:preview)) (in-package #:scrapycl/utils) diff --git a/tutorial/step1.lisp b/tutorial/step1.lisp index 0f0fd20..aa16e67 100644 --- a/tutorial/step1.lisp +++ b/tutorial/step1.lisp @@ -1,5 +1,6 @@ (uiop:define-package #:scrapycl/tutorial/step1 (:use #:cl) + (:import-from #:log) (:import-from #:serapeum #:fmt) (:import-from #:str @@ -13,7 +14,7 @@ ()) -(defclass step1 (scrapycl:spider) +(defclass quotes-spider (scrapycl:spider) () (:default-initargs :initial-requests (list (make-instance 'quotes-page-request @@ -22,13 +23,14 @@ :url "https://quotes.toscrape.com/page/2/")))) -(defmethod scrapycl:process ((spider step1) +(defmethod scrapycl:process ((spider quotes-spider) (request quotes-page-request)) (multiple-value-bind (data url) (scrapycl:fetch spider request) - (let* ((page-number (third (split "/" (quri:uri-path url)))) - (filename (fmt "quotes-~A.html" page-number))) - (write-string-into-file data filename - :if-exists :supersede) + (let* ((page-number (third (str:split "/" (quri:uri-path url)))) + (filename (format nil "quotes-~A.html" page-number))) + (alexandria:write-string-into-file data filename + :if-exists :supersede) + (log:info "Page saved to" filename) ;; return nothing, to stop processing (values)))) diff --git a/tutorial/step2.lisp b/tutorial/step2.lisp index 1990c39..ae25c9a 100644 --- a/tutorial/step2.lisp +++ b/tutorial/step2.lisp @@ -9,18 +9,14 @@ (in-package #:scrapycl/tutorial/step2) -(defclass index-page-request (scrapycl:request) - ()) - - (defclass quotes-page-request (scrapycl:request) ()) -(defclass step2 (scrapycl:spider) +(defclass quotes-spider (scrapycl:spider) () (:default-initargs - :initial-requests (list (make-instance 'index-page-request + :initial-requests (list (make-instance 'quotes-page-request :url "https://quotes.toscrape.com/")))) @@ -44,8 +40,8 @@ (quote-tags 
obj)))) -(defmethod scrapycl:process ((spider step2) - (request index-page-request)) +(defmethod scrapycl:process ((spider quotes-spider) + (request quotes-page-request)) (let ((data (scrapycl:fetch spider request))) (lquery:$ (initialize data) diff --git a/tutorial/step3.lisp b/tutorial/step3.lisp index aeb0f4c..319f0a0 100644 --- a/tutorial/step3.lisp +++ b/tutorial/step3.lisp @@ -11,7 +11,7 @@ (in-package #:scrapycl/tutorial/step3) -(defclass index-page-request (scrapycl:request) +(defclass quotes-page-request (scrapycl:request) ()) @@ -19,11 +19,10 @@ ()) - (defclass quotes-spider (scrapycl:spider) () (:default-initargs - :initial-requests (list (make-instance 'index-page-request + :initial-requests (list (make-instance 'quotes-page-request :url "https://quotes.toscrape.com/")))) diff --git a/tutorial/tutorial.md b/tutorial/tutorial.md index baaf7ae..e69de29 100644 --- a/tutorial/tutorial.md +++ b/tutorial/tutorial.md @@ -1,751 +0,0 @@ -In this tutorial we'll train our parsing skill on this toy site: https://quotes.toscrape.com/. -We will follow [Scrapy's tutorial](https://docs.scrapy.org/en/latest/intro/tutorial.html) and see if we can get all the data using ScrapyCL. - - -Instead of: - -``` -scrapy shell "https://quotes.toscrape.com/page/1/" -``` - - -``` -SCRAPYCL/TUTORIAL/STEP1> (defparameter *response* - (scrapycl:fetch (make-instance 'step1) - (make-instance 'quotes-page-request - :url "https://quotes.toscrape.com/page/1/"))) -*RESPONSE* -SCRAPYCL/TUTORIAL/STEP1> *response* -" - - - - Quotes to Scrape - - >> response.css("title::text").getall() -['Quotes to Scrape'] -``` - -Lisp: - - -``` -SCRAPYCL/TUTORIAL/STEP1> (lquery:$ - (initialize *response*) - "title" - (text)) -#("Quotes to Scrape") -``` - -Python: - -``` -response.css("title").getall() -['Quotes to Scrape'] -``` - -Lisp: - -``` -SCRAPYCL/TUTORIAL/STEP1> (lquery:$ - (initialize *response*) - "title") -#(#) -SCRAPYCL/TUTORIAL/STEP1> (lquery:$ - (initialize *response*) - "title" - (serialize)) -#("Quotes to Scrape") -``` - -To get only a single item. 
Python: - -``` ->>> response.css("title::text").get() -'Quotes to Scrape' -``` - -In Lisp use `lquery:$1` instead of `lquery:$`: - -``` -SCRAPYCL/TUTORIAL/STEP1> (lquery:$1 - (initialize *response*) - "title" - (text)) -"Quotes to Scrape" -``` - - -As an alternative, you could’ve written in Python: - -``` ->>> response.css("title::text")[0].get() -'Quotes to Scrape' -``` - -In Lisp: - -``` -SCRAPYCL/TUTORIAL/STEP1> (lquery:$ - (initialize *response*) - "title") -#(#) - -SCRAPYCL/TUTORIAL/STEP1> (elt * 0) -# - -SCRAPYCL/TUTORIAL/STEP1> (plump:text *) -"Quotes to Scrape" -``` - -There is no analogue to `re`: - -``` ->>> response.css("title::text").re(r"Quotes.*") -['Quotes to Scrape'] -``` - -but you can use a filter function; - -``` -SCRAPYCL/TUTORIAL/STEP1> (lquery:$ - (initialize *response*) - "title" - (text) - (filter (lambda (text) - (cl-ppcre:scan "Quotes.*" text)))) -#("Quotes to Scrape") -``` - - -``` ->>> response.css("title::text").re(r"Q\w+") -['Quotes'] -``` - -Lisp: - -``` -SCRAPYCL/TUTORIAL/STEP1> (lquery:$ - (initialize *response*) - "title" - (text) - (each (lambda (text) - (cl-ppcre:scan-to-strings "Q\\w+" text)) - :replace t)) -#("Quotes") -``` - - -Python: - -``` ->>> response.css("title::text").re(r"(\w+) to (\w+)") -['Quotes', 'Scrape'] -``` - -Lisp: - - -``` -SCRAPYCL/TUTORIAL/STEP1> (lquery:$1 - (initialize *response*) - "title" - (text) - (map (lambda (text) - (nth-value 1 - (cl-ppcre:scan-to-strings "(\\w+) to (\\w+)" text))))) -#("Quotes" "Scrape") -``` - - -## Extracting quotes and authors (step2) - - -``` -SCRAPYCL/TUTORIAL/STEP2> (defparameter *response* - (scrapycl:fetch (make-instance 'step2) - (make-instance 'index-page-request - :url "https://quotes.toscrape.com/"))) -*RESPONSE* -``` - -Now lets extract quotes. Instead of this Python code: - -``` -response.css("div.quote") -[, -, -...] -``` - -we can do this in Lisp: - -``` -SCRAPYCL/TUTORIAL/STEP2> (lquery:$ - (initialize *response*) - "div.quote") -#(# # - # # - # # - # # - # #) -``` - -Here is how we can to limit the number of items to not clutter the REPL: - -``` -SCRAPYCL/TUTORIAL/STEP2> (lquery:$ - (initialize *response*) - "div.quote" - (function - (lambda (nodes) - (serapeum:take 2 nodes)))) -#(# #) -``` - -And preview the extracted pieces: - - -``` -SCRAPYCL/TUTORIAL/STEP2> (lquery:$ - (initialize *response*) - "div.quote" - (function - (lambda (nodes) - (serapeum:take 2 nodes))) - (serialize)) -#("
- “The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.” - by Albert Einstein - (about) - -
- Tags: - > - - change - - deep-thoughts - - thinking - - world - -
-
" - "
- “It is our choices, Harry, that show what we truly are, far more than our abilities.” - by J.K. Rowling - (about) - -
- Tags: - > - - abilities - - choices - -
-
") -``` - -Now let's extract the first quote. Instead of this code in Python: - -``` ->>> text = quote.css("span.text::text").get() ->>> text -'“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”' ->>> author = quote.css("small.author::text").get() ->>> author -'Albert Einstein' -``` - -we can do: - -``` -SCRAPYCL/TUTORIAL/STEP2> (lquery:$ - (initialize *response*) - "div.quote" - (function - (lambda (nodes) - (serapeum:take 2 nodes))) - (combine - (lquery:$1 - "span.text" - (text)) - (lquery:$1 - "small.author" - (text)))) -#(("“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”" - "Albert Einstein") - ("“It is our choices, Harry, that show what we truly are, far more than our abilities.”" - "J.K. Rowling")) -``` - -Note, this code we put after the `serapeum:take 2`: - -``` -(combine - (lquery:$1 - "span.text" - (text)) - (lquery:$1 - "small.author" - (text)))) -``` - -It allows us to extract two items from the `div.quote` node simultaneously, using function `combine`. These two pieces are combined into an array like: - -``` -#("“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”" - "Albert Einstein") -``` - -But because of functional nature of lquery, this `combine` operation is applied to all `div.quote` nodes. - -Next, original Scrapy's tutorial shows how to extract tags list for each quote: - -``` ->>> tags = quote.css("div.tags a.tag::text").getall() ->>> tags -['change', 'deep-thoughts', 'thinking', 'world'] -``` - -but knowing how does `combine` work, we can just add another rule into the `combine` form: - -``` -SCRAPYCL/TUTORIAL/STEP2> (lquery:$ - (initialize *response*) - "div.quote" - (function - (lambda (nodes) - (serapeum:take 2 nodes))) - (combine - (lquery:$1 - "span.text" - (text)) - (lquery:$1 - "small.author" - (text)) - (lquery:$ - "div.tags a.tag" - (text)))) -#(("“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”" - "Albert Einstein" #("change" "deep-thoughts" "thinking" "world")) - ("“It is our choices, Harry, that show what we truly are, far more than our abilities.”" - "J.K. Rowling" #("abilities" "choices"))) -``` - -Note, for tags we are using `lquery:$` because there is a list of them. - -Scrapy's tutorial creates a hash table for each quote, but ScrapyCL prefers to work with CLOS objects. 
So, we'll create a separate -`QUOTE` class: - -``` -(defclass quote-item () - ((text :initarg :text - :type string - :reader quote-text) - (author :initarg :author - :type string - :reader quote-author) - (tags :initarg :tags - :type (serapeum:soft-list-of string) - :reader quote-tags))) - - -(defmethod print-object ((obj quote-item) stream) - (print-unreadable-object (obj stream :type t) - (format stream "~A by ~A ~{#~A~^, ~}" - (quote-text obj) - (quote-author obj) - (quote-tags obj)))) -``` - -Now we'll use `map-apply` to transform parsed data into the CLOS objects: - -``` -SCRAPYCL/TUTORIAL/STEP2> (lquery:$ - (initialize *response*) - "div.quote" - (function - (lambda (nodes) - (serapeum:take 2 nodes))) - (combine - (lquery:$1 - "span.text" - (text)) - (lquery:$1 - "small.author" - (text)) - (lquery:$ - "div.tags a.tag" - (text))) - (map-apply - (lambda (text author tags) - (make-instance 'quote-item - :text text - :author author - :tags (coerce tags 'list))))) -#(# - #) -``` - -Put this piece of code into a PROCESS method: - -``` -(defmethod scrapycl:process ((spider step2) - (request index-page-request)) - (let ((data (scrapycl:fetch spider request))) - (lquery:$ - (initialize data) - "div.quote" - (combine - (lquery:$1 - "span.text" - (text)) - (lquery:$1 - "small.author" - (text)) - (lquery:$ - "div.tags a.tag" - (text))) - (map-apply - (lambda (text author tags) - (make-instance 'quote-item - :text text - :author author - :tags (coerce tags 'list))))))) -``` - -and don't forget to remove this piece with filter call: - -``` -(function - (lambda (nodes) - (serapeum:take 2 nodes))) -``` - -we needed it only for a debug purpose. - -Now start our scraper: - -``` -SCRAPYCL/TUTORIAL/STEP2> (scrapycl:start (make-instance 'step2) :wait t) -(# - # - # - # - # - # - # - # - # - #) -``` - -As you can see, by default it returns a list of all items on the page. - -## Storing the scraped data¶ - -Scrapy's tutorial shows this command as example on how to save scraped data to a json file: - -``` -scrapy crawl quotes -O quotes.json -``` - -With ScrapyCL we can do the similar but with OUTPUT argument to the START generic-function: - -``` -SCRAPYCL/TUTORIAL/STEP2> (scrapycl:start (make-instance 'step2) - :wait t - :output (scrapycl/output/json:json-lines #P"items.json")) -``` - -And content of `items.json` file will look like: - -```json -{"text":"“A day without sunshine is like, you know, night.”","author":"Steve Martin","tags":["humor","obvious","simile"]} -{"text":"“A woman is like a tea bag; you never know how strong it is until it's in hot water.”","author":"Eleanor Roosevelt","tags":["misattributed-eleanor-roosevelt"]} -``` - -Each object is on it's own line in a [JsonLines](https://jsonlines.org/) format. If you want to get a JSON file with a list, then use `SCRAPYCL/OUTPUT/JSON:JSON-LIST` instead. Or use `SCRAPYCL/OUTPUT/JSON:JSON-DICT` to get a file with JSON object. But beware, these two outputs can't work in :append mode. - - -### Custom processing - -Actually, :OUTPUT argument accepts any lisp function. The only requirements are: - -- it should accept a single argument. The objects for which there is no specialized method of PROCESS generic-function will be passed into this function. -- it should accept SCRAPYCL/OUTPUT:STOP-OUTPUT symbol and flush buffer, close file or transaction, because this symbol is sent when all links were processed and there is no more data to process. 
- - -## Following links - -Scrapy's tutorial shows that for following links you should yield a request like this: - -``` -yield scrapy.Request(next_page, callback=self.parse) -``` - -With ScrapyCL to follow links, just return new request objects from your method for PROCESS generic-function. And because you are returning an object of a customized request class, you can add more data slots to the request to make this additional data available during request processing. For example, such data might include a parent category of the page or an some piece of data available on other page. - -Scrapy's test site contains quotes and their authors. Let's make our scraper parse not only quotes but also their authors and to bind them by adding `author_id` to the quote. For simplicity I'll use a hash-table instead of the real database. See `tutorial/step3.lisp` file for full code for this part. - -First, we need to add an author-item class: - -``` -(defclass author-item () - ((name :initarg :name - :type string - :reader author-name) - (birthday :initarg :birthday - :type string - :reader author-birthday) - (bio :initarg :bio - :type string - :reader author-bio))) - - -(defmethod print-object ((obj author-item) stream) - (print-unreadable-object (obj stream :type t) - (format stream "~A" - (author-name obj)))) - -``` - -Now let's make our spider follow links leading to the next page and to authors pages: - -``` -SCRAPYCL/TUTORIAL/STEP3> (defparameter *response* - (scrapycl:fetch (make-instance 'quotes-spider) - (make-instance 'index-page-request - :url "https://quotes.toscrape.com/"))) - -SCRAPYCL/TUTORIAL/STEP3> (lquery:$1 - (initialize *response*) - "ul.pager a" - (attr "href")) -"/page/2/" -``` - -To make absolute URL for request we need to merge this path with a base URL. ScrapyCL's FETCH function returns current pages real URL as the second value. Also, it provides as MERGE-WITH-URL lquery form. Together they can be used like this: - -``` -SCRAPYCL/TUTORIAL/STEP3> (multiple-value-bind (response base-url) - (scrapycl:fetch (make-instance 'quotes-spider) - (make-instance 'index-page-request - :url "https://quotes.toscrape.com/")) - (lquery:$1 - (initialize response) - "ul.pager a" - (attr "href") - (merge-url-with base-url))) -"https://quotes.toscrape.com/page/2/" -``` - -It is better to use URL returned by fetch because of these two reasons: - -- this URL can differ from original request URL because site might redirect request to the other page. -- href attributes on the page can be relative, like `../quotes/page/1` and will not work if you hardcode base url. - -Let's figure out which author pages should be followed. Original Scrapy tutorial uses this CSS selector `.author + a`, but lquery does not support it. 
To find a siblings of `.author` element but we can use NEXT form to select subsequent element following the `author` node: - -``` -SCRAPYCL/TUTORIAL/STEP3> (multiple-value-bind (response base-url) - (scrapycl:fetch (make-instance 'quotes-spider) - (make-instance 'index-page-request - :url "https://quotes.toscrape.com/")) - (lquery:$ - (initialize response) - ".author" - (next "a") - (attr "href") - (merge-url-with base-url))) -#("https://quotes.toscrape.com/author/Albert-Einstein" - "https://quotes.toscrape.com/author/J-K-Rowling" - "https://quotes.toscrape.com/author/Albert-Einstein" - "https://quotes.toscrape.com/author/Jane-Austen" - "https://quotes.toscrape.com/author/Marilyn-Monroe" - "https://quotes.toscrape.com/author/Albert-Einstein" - "https://quotes.toscrape.com/author/Andre-Gide" - "https://quotes.toscrape.com/author/Thomas-A-Edison" - "https://quotes.toscrape.com/author/Eleanor-Roosevelt" - "https://quotes.toscrape.com/author/Steve-Martin") -``` - -Ok, now, when we have a URLs to follow, let modify our processing function to return them as new requests: - -``` -(defclass author-page-request (scrapycl:request) - ()) - - -(defmethod scrapycl:process ((spider quotes-spider) - (request index-page-request)) - (multiple-value-bind (data base-url) - (scrapycl:fetch spider request) - (log:info "Fetched" base-url) - - (let ((quotes (lquery:$ - (initialize data) - "div.quote" - (combine - (lquery:$1 - "span.text" - (text)) - (lquery:$1 - "small.author" - (text)) - (lquery:$ - "div.tags a.tag" - (text))) - (map-apply - (lambda (text author tags) - (make-instance 'quote-item - :text text - :author author - :tags (coerce tags 'list)))))) - (next-page-url (lquery:$1 - (initialize data) - "ul.pager a" - (attr "href") - (merge-url-with base-url))) - (author-urls (lquery:$ - (initialize data) - ".author" - (next "a") - (attr "href") - (merge-url-with base-url)))) - ;; Now return objects and new requests - (list quotes - (map 'list (lambda (url) - (make-instance 'author-page-request - :url url)) - author-urls) - (when next-page-url - (make-instance 'index-page-request - :url next-page-url)))))) -``` - -If we do just that and then run our scraper, then we'll see it walks only through quotes pages and ignores author pages: - -``` -SCRAPYCL/TUTORIAL/STEP3> (scrapycl:start (make-instance 'quotes-spider) - :wait t - :output (scrapycl/output/json:json-lines #P"items.json")) - [12:50:57] scrapycl/tutorial/step3 slime0lIMfb (process quotes-spider index-page-request) - - Fetched BASE-URL: # - [12:50:57] scrapycl/tutorial/step3 slime0lIMfb (process quotes-spider index-page-request) - - Fetched BASE-URL: # - - [12:50:57] scrapycl/tutorial/step3 slime0lIMfb (process quotes-spider index-page-request) - - Fetched BASE-URL: # -``` - -But in the file `items.json` you might see interesting records: - -``` -{"url":"https://quotes.toscrape.com/author/Steve-Martin"} -{"url":"https://quotes.toscrape.com/author/Eleanor-Roosevelt"} -{"url":"https://quotes.toscrape.com/author/Thomas-A-Edison"} -{"url":"https://quotes.toscrape.com/author/Andre-Gide"} -... -``` - -This is because we forgot to define a PROCESS method for our class AUTHOR-PAGE-REQUEST. ScrapyCL sees objects without a PROCESS method and decides these are final objects to be serialized to the output. Let's write a method to extract information about authors as well. 
- -Here I've just translated these Python rules: - - -``` -def parse_author(self, response): - def extract_with_css(query): - return response.css(query).get(default="").strip() - - yield { - "name": extract_with_css("h3.author-title::text"), - "birthdate": extract_with_css(".author-born-date::text"), - "bio": extract_with_css(".author-description::text"), - } -``` - -into the lquery DSL: - -``` -SCRAPYCL/TUTORIAL/STEP3> (multiple-value-bind (response) - (scrapycl:fetch (make-instance 'quotes-spider) - (make-instance 'author-page-request - :url "https://quotes.toscrape.com/author/Thomas-A-Edison")) - (lquery:$1 - (initialize response) - (combine - (lquery:$1 - "h3.author-title" - (text)) - (lquery:$1 - ".author-born-date" - (text)) - (lquery:$1 - ".author-description" - (text) - (map #'str:trim))))) - -("Thomas A. Edison" "February 11, 1847" - "Thomas Alva Edison was an American inventor, scientist and businessman who developed many devices that greatly influenced life around the world, including the phonograph, the motion picture camera, and a long-lasting, practical electric light bulb. Dubbed \"The Wizard of Menlo Park\" (now Edison, New Jersey) by a newspaper reporter, he was one of the first inventors to apply the principles of mass production and large teamwork to the process of invention, and therefore is often credited with the creation of the first industrial research laboratory.Edison is considered one of the most prolific inventors in history, holding 1,093 U.S. patents in his name, as well as many patents in the United Kingdom, France and Germany. He is credited with numerous inventions that contributed to mass communication and, in particular, telecommunications. His advanced work in these fields was an outgrowth of his early career as a telegraph operator. Edison originated the concept and implementation of electric-power generation and distribution to homes, businesses, and factories – a crucial development in the modern industrialized world. His first power station was on Manhattan Island, New York.") -``` - -And here is the full PROCESS method which will return an author object: - - -``` -(defmethod scrapycl:process ((spider quotes-spider) - (request author-page-request)) - (multiple-value-bind (data base-url) - (scrapycl:fetch spider request) - (log:info "Fetched" base-url) - - (lquery:$1 - (initialize data) - (combine - (lquery:$1 - "h3.author-title" - (text)) - (lquery:$1 - ".author-born-date" - (text)) - (lquery:$1 - ".author-description" - (text) - (map #'str:trim))) - (map-apply - (lambda (name birthday bio) - (make-instance 'author-item - :name name - :birthday birthday - :bio bio)))))) -``` - - -Now, if you start the spider again, you'll get quotes and authors mixed in the same `items.json` file. But how to put different kinds of object into a different output files? This is easy - just use a TYPED-OUTPUT. This kind of output redirects items into another outputs depending on their type. - -To separate output into `quotes.json` and `authors.json`, execute scraper like this: - -``` -SCRAPYCL/TUTORIAL/STEP3> (scrapycl:start (make-instance 'quotes-spider) - :wait t - :output (scrapycl/output/typed:typed-output - (list (cons 'quote-item - (scrapycl/output/json:json-lines #P"quotes.json")) - (cons 'author-item - (scrapycl/output/json:json-lines #P"authors.json"))))) -```