(defconst +szotar-tones+
  '(("~" . high) ("/" . rising) ("\\" . falling)))
(defconst +szotar-diacritics+
  '((high . "̄") (rising . "́") (falling . "̀")))

(defun tone-class (str)
  "Return NIL for no tone and for tones 3 and 6;
return a list of \(consonants type first-vocal rest) otherwise.
In the case of ng, g takes the role of the vocal."
  (let ((match (string-match (concat "^\\([^aeiou]*\\)\\([aeiou]\\)"
    (if match
        (let ((consonant (match-string 1 str))
              (vocal (match-string 2 str))
              (rest (match-string 3 str))
              (type (match-string 4 str))
              (post (match-string 5 str)))
          (let ((tone (cdr (assoc type +szotar-tones+))))
            (when tone
              (list consonant tone vocal (concat rest post)))))
      (let ((match (string-match "^ng\\(h?\\)\\([~/\\]\\)\\(.*\\)$"
        (when match
          (let ((rest (match-string 1 str))
                (type (match-string 2 str))
                (post (match-string 3 str)))
            (let ((tone (cdr (assoc type +szotar-tones+))))
              (when tone
                (list "n" tone "g" (concat rest post))))))))))

(defun format-toned (consonants type first-vocal rest)
  (let ((diacritic (cdr (assoc type +szotar-diacritics+))))
    (concat consonants first-vocal diacritic rest)))

(defun parse-accents (str)
  "For every word, if it ends with ~, / or \\,
mark it with high, rising and falling, respectively."
  (let ((words (split-string str)))
      (while words
        (let ((toned-word (tone-class (car words))))
          (if toned-word
              (princ (apply #'format-toned toned-word))
            (princ (car words)))
          (when (cdr words)
            (princ " ")))
        (setq words (cdr words))))))

(defun cantonese-htmlize (str)
  (interactive "syuht ping: ")
  (kill-new (parse-accents str)))

(defun szotar-next-line ()
  "Takes the next line and prints its HTML representation."
  (if (re-search-forward "^\\(.*\\) = \\(.*\\)$" (point-max) t)
      (let* ((definition (match-string 2))
             (cantonese (parse-accents (match-string 1))))
        (princ (concat "<tr>\n"
                       "<td><span class=\"cantonese\">" cantonese "</span></td>\n"
                       "<td><span class=\"definition\">" definition "</span></td>\n"

(defun szotar-htmlize ()
  "Generates a CSS-driven HTML file from the current buffer.
The buffer is assumed to be in UTF-8 encoding."
  (with-output-to-temp-buffer (concat (buffer-name (current-buffer)) ".html")
    (princ (concat "<html>\n"
                   "<title>" (buffer-name (current-buffer)) "</title>\n"
                   "<link rel=\"stylesheet\" type=\"text/css\""
                   " href=\"szotar.css\" media=\"all\" />\n"
                   "<meta http-equiv=\"Content-Type\""
                   " content=\"text/html;charset=utf-8\" />\n"
      (goto-char (point-min))
      (while (szotar-next-line)))
    (princ "</table>\n</body>\n</html>")))