A while ago I posted about my horrible scraper code and about how I was too embarrassed to share it…

Well, it worked well! It emailed me nice “HTML diff’s” of how the page was changing, and it successfully notified me when the visa was open for applications.

I have decided to post it so that if someone wants to write a scraper/emailer in Lisp at least, they have an example. (I couldn’t find many examples of how to use a lot of these libraries to start with which made it quite hard.)

So excuse the mess, here’s the code:

(asdf:operate 'asdf:compile-op :curl)
(asdf:operate 'asdf:load-op :curl)
(require :cl-html-parse)
(require :htmlgen)
(require :cl-html-diff)
(require :cl-smtp)

;; TODO
; Fix logic so that it doesn't go crazy when the page / selection is empty
; Get rid of those horrible changed, same and new functions
; Get rid of email list at the top
; Tidy up the send mail function

;; Settings
(defparameter *mailserver* "localhost")

(defparameter *emails* '("kzar@kzar.co.uk" "fredbob@gmail.com"))

(defparameter *store* "/home/kzar/scrape/"
  "Where all the temp files are saved.")

(defparameter *log-file* "/var/log/visa-check.log")

;; Workings
(defvar *branches-above*)

(defun parse-html (html)
  "Take some HTML and return a tree."
  (NET.HTML.PARSER:parse-html html))

(defun craft-html (tree)
  "Take a tree and return some HTML"
  (with-output-to-string (stream)
    (net.html.generator:html-print tree stream)))

(defun load-file (filename)
  "Load the file given by filename and and return the contents as a string"
  (with-open-file (in filename)
    (with-standard-io-syntax
      (let ((seq (make-string (file-length in))))
        (read-sequence seq in)
        seq))))

(defun load-html (filename)
  "Load a HTML file and parse it into s-expressions"
  (parse-html (load-file filename)))

(defun save-string (filename string)
  "Write the string to filename (overwriting anything that might already be there)."
  (with-open-file (out filename
                       :direction :output
                       :if-exists :supersede)
    (with-standard-io-syntax
      (write-string string out))))

(defun save-html (filename html)
  "Save some s-expressions to HTML"
  (save-string filename (craft-html html)))

(defun append-string (filename string)
  "Append the string to the end of the file given by filename"
  (with-open-file (out filename
                       :direction :output
                       :if-exists :append)
    (with-standard-io-syntax
      (write-string string out))))

(defun time-stamp ()
  "Return a nicely formatted timestamp of the current date / time"
  (multiple-value-bind (second minute hour day month year)
      (decode-universal-time (get-universal-time))
    (format nil "~A/~A/~A ~A:~A:~A" day month year hour minute second)))

(defun log-message (message)
  "Append a log entry to the *log* file, including a timestamp"
  (append-string *log-file* (concatenate 'string (time-stamp) ": " message)))

(defun file-exists? (filename)
  "Return T if the filename exists, nil otherwise"
  (if (with-open-file (in filename :if-does-not-exist nil)
        in)
      t))

(defun download (URL)
  "Download the contents of the URL and return it"
  (curl:with-connection-returning-string (:cookies nil)
    (curl:set-option :url URL)
    (curl:perform)))

(defun strip-utf (string)
  "Return string minus any UTF characters."
  (remove-if (lambda (x) (> (char-code x) 127))
             string))

(defun send-email (from to subject body)
  "Use cl-smtp to send an email."
  (cl-smtp:send-email
   *mailserver*
   from
   to
   subject
   (format nil "<style type=\"text/css\">ins {background: #bfb} del{background: #fcc} ins,del {text-decoration: none}</style>~A" body)
   :extra-headers '(("Content-type" "text/html; charset=\"iso-8859-1\""))))

(defun spam (from subject body)
  "Spam everyone in *emails* the email given by the parameters"
  (dolist (email *emails*)
    (send-email from email subject body)))

(defun list-if (foo)
  "If foo's not nil return (list foo)."
  (if foo
      (list foo)))

(defun find-item (item list &key strict?)
  "Return item if item is inside the list, nil otherwise. If strict? is nil and item is a string, the list can contain a string that just contains the item."
  (if (or strict?
          (not (stringp item)))
      (find item list :test 'equal)
      (if (loop for hay in list
             if (stringp hay)
             if (search item hay) collect it
             else if (equal item hay) collect it)
          item)))

(defun find-all (symbols list &key strict?)
  "Take a list of symbols and do a find for each in the list. If strict? is nil then also grep strings for a match.."
  (if (not (consp symbols))
      (list-if (find-item symbols list :strict? strict?))
      (if (equal symbols
                 (loop for symbol in symbols
                    collect (find-item symbol list :strict? strict?)))
          symbols
          nil)))

(defun find-tag-helper (tag tree elements strict? branches-above)
  "Does the work for find-tag, horrible horrible mess! *Hangs head in shame*"
  (if (and (eql (car tree) tag)
           (or (not elements)
               (find-all elements tree :strict? strict?)))
      (progn
        (setf *branches-above* branches-above)
        tree)
      (let ((goal (or (if (consp (car tree))
                          (find-tag-helper tag (car tree) elements strict? branches-above))
                      (if (consp (cdr tree))
                          (find-tag-helper tag (cdr tree) elements strict? branches-above)))))
        (if (and (numberp *branches-above*)
                 (> *branches-above* 0))
            (progn
              (decf *branches-above*)
              tree)
            goal))))

(defun find-tag (tag tree &key elements (strict? t) (branches-above 0))
  "Take a tree, a tag and the tag's value and return the rest of the tree from that point."
  (let ((*branches-above* 0))
    (find-tag-helper tag tree elements strict? branches-above)))

(defmacro add-page (page-list url name &body body)
  `(push (list ,url ,name
               (lambda (html) ,@body)) ,page-list))

(defun check-page (URL name func)
  "Take a URL, download the page and have a look for differences"
  (let* ((filename (concatenate 'string *store* name "/old"))
         (new-file (funcall func (parse-html (download URL)))))
    (ensure-directories-exist filename)
    (if (file-exists? filename)
        (let ((old-file (car (load-html filename)))) ;; FIXME why is this (car needed?!
          (if (equal old-file new-file)
              (same-page name new-file)
              (changed-page name
                            (save-html filename new-file)
                            (html-diff:HTML-DIFF (craft-html old-file) (craft-html new-file)))))
        (new-page name (save-html filename new-file)))))

(defun new-page (title html)
  "Function that's called when a page is checked for the first time"
  (log-message (format nil "NEW Page - ~A~%" title))
  (spam "kzar@kzar.co.uk"
        (format nil "**Visa Alert** - New Page (~A)" title)
        (format nil "I have started watching this new page called \"~A\". ~%~%Here's the HTML:~%~%~A"  title html)))

(defun same-page (title html)
  "Called when the page is the same, (possibly does nothing!)"
  (log-message (format nil "no change to - ~A~%" title)))

(defun changed-page (title html diff)
  "Function that's called when a page changed. Email everyone and sound the sirens!"
  (log-message (format nil "CHANGED PAGE - ~A~%" title))
  (spam "kzar@kzar.co.uk"
        (format nil "**Visa Alert** - Changed Page (~A)" title)
        (format nil "The page ~A has changed! Below is a diff: ~%~%~A" title diff)))

(defun check-pages (pages)
  "Check all the pages given by for changes."
   (dolist (page pages)
     (check-page (car page) (cadr page) (caddr page))))

(let (pages)
  (add-page pages "http://www.bunac.org/UK/workcanada/applying.aspx" "Visa application Page"
    (find-tag :style html :elements '("#FF924A") :strict? nil :branches-above 2))

  (add-page pages "http://kzar.co.uk/testpage" "Test Page"
    (car html)) ; FIXME WHY IS THIS CAR REQUIRED?!

  (add-page pages "http://www.bunac.org/uk/workcanada/" "Visa Homepage"
    (find-tag :div html :elements '("width:480px") :strict? nil :branches-above 1))

  (add-page pages "https://www.bunac.org/applyCanada/" "Visa payment page"
    (find-tag :div html :elements "bodyContent" :strict? nil :branches-above 1))

  (check-pages pages))

(quit)

(Please bear in mind that I threw this code together very quickly. If you are using it for something important make sure it works correctly first!)

I had it running every hour by putting a shell script in my /etc/cron.hourly to run the code in a new SBCL:

#!/bin/bash
/usr/src/clbuild/clbuild preloaded --load /var/lisp/visa-check.cl

(That’s why there’s a (quit) at the end of the Lisp code. With a new SBCL instance firing up once an hour we need to make sure the old ones are closed!)

It seems to work pretty nicely anyway. It emails you notification of changes like this:

Change notification

Cheers, Dave