投稿‎ > ‎

Syntax tree parser in LISP

posted Mar 21, 2013, 7:54 AM by Zhang Wenxu   [ updated Jul 19, 2013, 9:00 AM ]
http://www.ioremap.net/node/325/

I’ve finally made it. Got really a lot of experience in functional-like thinking (and partially programming) and recalled bits of LISP I knew.

Parser can build a tree with concatenation, multiplication and OR nodes. It supports braces and symbol escapement. Here are couple of examples:

$ ./stree.lisp "\[a\*"
(cat * (cat a [))
$ ./stree.lisp "zxc*"
(mult (cat c (cat x z)))
$ ./stree.lisp "[a|b]*qwe"
(cat e (cat w (cat q (mult (or b a)))))

It uses rather dumb parser which does not differentiate priority or the control symbols, so string like “123*” will have multiplication operator applied to all previous symbols (namely concatenation of 1, 2 and 3). There are no error checks either, which I will likely work to fix, especially implement graceful exiting for non-closed braces.

Next step is to build a deterministic finite automate out of this LISP objects.

Code in LISP under the link.
I believe I’m not that bad as I started to believe.

#!/usr/bin/clisp

(defconstant *control-esc* #\\)
(defconstant *control-mult* #\*)
(defconstant *control-or* #\|)

(defconstant *control-cat-id* "cat")
(defconstant *control-or-id* "or")
(defconstant *control-mult-id* "mult")

(defconstant *control-object-open* #\[)
(defconstant *control-object-close* #\])

(defconstant *control-string* (concatenate 'string (list *control-mult*
							 *control-or*
							 *control-object-open*
							 *control-object-close*)))

(defun control-object-p (obj)
  (and (characterp obj) (or (char= obj *control-mult*) (char= obj *control-or*))))

(defun read-object (str len l total esc)
  (let ((sym (elt str 0))
	(skip 1))
    (cond
      (esc
        (push (list sym) l)
	(setf esc nil))
      ((char= sym *control-object-open*)
        (let* ((ret (read-object (subseq str skip) (- len skip) '() 0 nil))
	       (obj (first ret))
	       (obj_skip (second ret)))
	  (setf skip (+ skip obj_skip))
	  (push obj l)))
      ((char= sym *control-object-close*))
      ((char= sym *control-esc*)
       (setf esc 1))
      (t
	(push sym l)))
    (setf total (+ total skip))
    (if (or (<= len skip) (char= sym *control-object-close*))
      (list l total)
      (read-object (subseq str skip) (- len skip) l total esc))))

(defun wrap-control (obj)
  (cond
    ((not (characterp obj))
     *control-cat-id*)
    ((char= obj *control-mult*)
     *control-mult-id*)
    ((char= obj *control-or*)
     *control-or-id*)
    (t
     *control-cat-id*)))

(defun unwind-list (obj)
  (if (and (listp obj) (= (length obj) 1))
    (first obj)
    obj))

(defun parse-list (l)
  (if (null l)
    (list nil nil)
    (let* ((obj (pop l))
	   (lret (parse-list l))
	   (ret (first lret))
	   (is-or (second lret))
	   (l '()))
      (if (listp obj)
	(setf obj (first (parse-list obj))))
      (if (= (length ret) 1)
	(setf ret (first ret)))
      (if (null ret)
	(push (unwind-list obj) l)
	(progn
	  (cond
	    (is-or
	      (push ret l)
	      (push (unwind-list obj) l)
	      (push *control-or-id* l)
	      (setf is-or nil))
	    ((and (characterp obj) (char= obj *control-or*))
	      (setf is-or 1)
	      (push ret l))
	    (t
	      (push ret l)
	      (unless (control-object-p obj)
		(push (unwind-list obj) l))
	      (push (wrap-control obj) l)))))
      (list l is-or))))

(dolist (str *args*)
  (let ((ret (read-object str (length str) '() 0 nil)))
    (format t "~A~%" (first (parse-list (first ret))))))
Google+
By Zhang Wenxu
Comments