Practical Common Lisp in Racket

I thought it would be interesting to rewrite some of the example programs in Peter Seibel's Practical Common Lisp in Racket to compare and contrast the two languages. An example from the spam filter in chapter 23:

#lang racket/base

(require racket/function racket/list racket/match
         (for-syntax racket/base racket/list syntax/datum syntax/parse syntax/parse/class/struct-id)
         srfi/145 ; from my extra-srfi-libs collection
         soup-lib/parameter ; From my soup-lib collection
         )

;;; Spam filter from Chapter 23 of PCL in Racket.

; Let's stick with the CL convention of putting earmuffs on global variables
(define *feature-database* (make-hash))
(define *total-spams* 0)
(define *total-hams* 0)

; User-configurable values are more Rackety when parameters are used.
(define-parameter max-ham-score  0.4)
(define-parameter min-spam-score 0.6)

(define (classify text)
  (classification (score (extract-features text))))

; PCL uses a class for this but doesn't use any OO features, so just a struct will do.
; Making it transparent gets the human-readable printed version for free instead of having to
; add one manually like in the book.
(struct word-feature (word [spam-count #:mutable] [ham-count #:mutable])
  #:extra-constructor-name make-word-feature
  #:transparent)

; It often uses with-slots to destructure fields of the class:
;
;(with-slots (spam-count ham-count) feature body ...)
;
; This can be approximated with
;
; (match-let ([(struct* word-feature ([spam-count spam-count] [ham-count ham-count]) feature]) body ...)
;
; but that's verbose and repetitive if you want bindings the same as the field names. So...

(define-syntax with-slots
  (syntax-rules ()
    [(_ sid (field ...) instance body ...) ; Note having to explicitly give the struct type
     (match-let ([(struct* sid ([field field] ...)) instance])
       body ...)]))

(define (intern-feature! word)
  (hash-ref! *feature-database* word (thunk (make-word-feature word 0 0))))

; Could use sets here instead of a list I suppose.
(define (extract-words words)
  (remove-duplicates (regexp-match* #px"[a-zA-Z]{3,}" words) string=?))

; Who needs special syntax to pass a function as an argument? Take that LISP-2's!
(define (extract-features words)
  (map intern-feature! (extract-words words)))

; CL dolist can easily be replaced by Racket for
(define (train! text type)
  (for ([feature (in-list (extract-features text))])
    (increment-count! feature type)))

; Can write increment-count like this, but it's verbose and has those error-prone paired
; field mutators and accessors.
;
;(define (increment-count! feature type)
;  (case type
;    ((ham)
;     (set-word-feature-ham-count! feature (add1 (word-feature-ham-count feature))))
;    ((spam)
;     (set-word-feature-spam-count! feature (add1 (word-feature-spam-count feature))))
;    (else (error "Unknown type" type))))
;
; Let's make something like the CL incf macro instead:
(define-syntax (incf! stx)
  (syntax-parse stx
    [(_ var:id) ; A single variable
     #'(set! var (add1 var))]
    [(_ sid:struct-id (field:id instance:expr)) ; A struct field
     (let ([pos (index-of (datum (sid.field-sym ...)) (syntax-e #'field))])
       (unless pos
         (raise-syntax-error #f (format "No such field ~A in ~A" (syntax-e #'field) (syntax-e #'sid.descriptor-id)) stx #'field))
       (with-syntax ([setf! (list-ref (syntax->list #'((~? sid.mutator-id #f) ...)) pos)]
                     [getf  (list-ref (syntax->list #'(sid.accessor-id ...)) pos)])
         (unless (syntax-e #'setf!)
           (raise-syntax-error #f (format "Field ~A is not mutable in ~A" (syntax-e #'field) (syntax-e #'sid.descriptor-id)) stx #'field))
         #'(setf! instance (add1 (getf instance)))))]))
; CL gets the W here. This does show off some fancier syntax-parse macro stuff, though.

; CL ecase would be easy to make a macro for, but having an explicit else is pretty painless
(define (increment-count! feature type)
  (case type
    ((ham)  (incf! word-feature (ham-count feature)))
    ((spam) (incf! word-feature (spam-count feature)))
    (else   (error "Unknown type" type))))

(define (increment-total-count! type)
  (case type
    ((ham)  (incf! *total-hams*))
    ((spam) (incf! *total-spams*))
    (else   (error "Unknown type" type))))

(define (clear-database!)
  (hash-clear! *feature-database*)
  (set! *total-hams*  0)
  (set! *total-spams* 0))

; Maybe these math-heavy functions could be written in Typed Racket with Float values...

(define (spam-probability feature)
  (with-slots word-feature (spam-count ham-count) feature
    (let ([spam-frequency (/ spam-count (max 1 *total-spams*))]
          [ham-frequency  (/ ham-count  (max 1 *total-hams*))])
      (/ spam-frequency (+ spam-frequency ham-frequency)))))

(define (bayesian-spam-probability feature [assumed-probability 1/2] [weight 1])
  (with-slots word-feature (spam-count ham-count) feature
    (let ([basic-probability (spam-probability feature)]
          [data-points (+ spam-count ham-count)])
      (/ (+ (* weight assumed-probability)
            (* data-points basic-probability))
         (+ weight data-points)))))

; Very imperative CL code with lots of side effects. Just go with the flow instead of
; rewriting to be more functional for the moment.
(define (score features)
  (let-syntax ([push! (syntax-rules () [(_ new-head list) (set! list (cons new-head list))])])
    (let ([spam-probs '()]
          [ham-probs '()]
          [number-of-probs 0])
      (for ([feature (in-list features)]
            #:unless (untrained? feature))
        (let ([spam-prob (exact->inexact (bayesian-spam-probability feature))])
          (push! spam-prob spam-probs)
          (push! (- 1.0 spam-prob) ham-probs)
          (incf! number-of-probs)))
      (let ([h (- 1.0 (fisher spam-probs number-of-probs))]
            [s (- 1.0 (fisher ham-probs number-of-probs))])
        (/ (+ (- 1.0 h) s) 2.0)))))

(define (untrained? feature)
  (with-slots word-feature (spam-count ham-count) feature
    (and (zero? spam-count) (zero? ham-count))))

; Use for/sum instead of a reduce-like HOF
(define (fisher probs number-of-probs)
  (inverse-chi-square
   (* -2.0 (for/sum ([prob (in-list probs)]) (log prob)))
   (* 2 number-of-probs)))

; Replacing loop can be tricky when you're not really familiar with it
(define (inverse-chi-square value degrees-of-freedom)
  (assume (even? degrees-of-freedom)) ; Replace with a contract?
  ;; Due to rounding errors in the multiplication and exponentiation
  ;; the sum computed in the loop may end up a shade above 1.0 which
  ;; we can't have since it's supposed to represent a probability.
  (min
   (let ([m (/ value 2.0)])
     (for/fold ([prob (exp (- m))]
                [sum 0.0]
                #:result (+ sum prob))
               ([i (in-range 1 (quotient degrees-of-freedom 2))])
       (values (* prob (/ m i)) (+ sum prob))))
   1.0))

(define (classification score)
  (values
   (cond
     ((<= score (max-ham-score)) 'ham)
     ((>= score (min-spam-score)) 'spam)
     (else 'unsure))
   score))

; Examples from the book
(train! "Make money fast" 'spam)
(classify "Make money fast")
(classify "Want to go to the movies?")
(train! "Do you have any money for the movies?" 'ham)
(classify "Make money fast")
(classify "Want to go to the movies?")


;; TODO: Test framework and file loader

If I stay motivated to work more on this project, I'll probably add commentary in Scribble docs and publish a Racket package with the programs.

5 Likes

I had done chapter 24 and 25:

1 Like

Cool. I'd been thinking about the ID3 chapter next (Using binary data parser code I already have instead of the book version)

Got distracted from that by wanting a Common Lisp style format to replace the rather anemic built in one. Luckily, the implementation in SLIB was easy to get working with Racket.

I think the following macro may be useful to map binary values to symbols.

(define-syntax symbolic-binary
  (syntax-rules ()
    ((_ type (sym val) ...)
     (let ((t type))
       (binary
        (lambda (in)
          (let* ((value (read-value t in))
                 (symbol (cond
                           ((equal? value val) 'sym)
                           ...
                           (else (raise-argument-error
                                  'symbolic-binary "invalid value" value)))))
            symbol))
        (lambda (out symbol)
          (let ((value (cond
                         ((eqv? symbol 'sym) val)
                         ...
                         (else (raise-argument-error
                                'symbolic-binary "invalid symbol" symbol)))))
            (write-value t value))))))))

It can be used to parse the beginning of a TIFF file.

(define-binary-class tiff
  ((endian (endian-binary))
   (type (type-binary)))

  (define (endian-binary)
    (symbolic-binary (iso-8859-1-string 2)
                     (big    "MM")
                     (little "II")))

  (define/public (int16-binary)
    (case endian
      ((big) (integer-be 2))
      ((little) (integer-le 2))
      (else (raise-argument-error
             'int16-binary "endian symbol" endian))))

  (define (type-binary)
    (symbolic-binary (send this int16-binary)
                     (tiff          42)
                     (bigtiff       43)
                     (panasonic-rw2 85))))

Nice.

Is it OK to add this macro to the API of the binary-class package?

The macro is public domain.

BTW: I could not find a way to seek in a binary file. Is the idea of binary-class to read all of the file sequentially?

Yes. It is a parser-generator. It is possible to seek in the binary struct (you have direct access to the port, after all). But how to sync reading and generating the structure?

Sometimes only parts of the binary file are relevant for a particular use-case: for example a program reading and writing the EXIF data of a RAW file probably does not want to decode the Bayer sensor data. Or you have a huge container file with a directory at the end of the file and you just want to display the contents of the directory. In that case seeking would be much faster than reading and parsing. And sometimes writing is not necessary at all: for example a program, which sorts camera RAW files into directories according to the time stamps of the EXIF data. And sometimes you do not know how to decode parts of the file, because the format is proprietary and only half of the format is reverse engineered.

But file-position works fine. I just tested it. Thanks!