1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
------------------------------------------------------------------------------
--- This module contains a very simple parser for HTML documents.
---
--- @author Michael Hanus
--- @version October 2020
------------------------------------------------------------------------------
{-# OPTIONS_CYMAKE -Wno-incomplete-patterns #-}

module HTML.Parser ( readHtmlFile, parseHtmlString )
 where

import Char
import HTML.Base

--- Reads a file with HTML text and returns the corresponding HTML expressions.
--- @param file - the name of a file containing HTML text
--- @return a list of HTML expressions (if the file contains exactly one
---         HTML document, this list should contain exactly one element)
readHtmlFile :: String -> IO [BaseHtml]
readHtmlFile file = readFile file >>= return . parseHtmlString

------------------------------------------------------------------------------
--- Transforms an HTML string into a list of HTML expressions.
--- If the HTML string is a well structured document, the list
--- of HTML expressions should contain exactly one element.
parseHtmlString :: String -> [BaseHtml]
parseHtmlString s = reverse (parseHtmlTokens [] (scanHtmlString s))

--- The data type for representing HTML tokens.
data HtmlToken = HText String | HElem String [(String,String)]

-- parse a list of HTML tokens into list of HTML expressions:
-- (first argument "helems" is a stack of already read tokens)
parseHtmlTokens :: [BaseHtml] -> [HtmlToken] -> [BaseHtml]
parseHtmlTokens helems []             = helems
parseHtmlTokens helems (HText s : hs) =
 parseHtmlTokens (BaseText s : helems) hs
parseHtmlTokens helems (HElem (t:ts) args : hs) =
 if t == '/'
   then let (structargs,elems,rest) = splitHtmlElems ts helems
        in parseHtmlTokens ([BaseStruct ts structargs elems] ++ rest) hs
   else parseHtmlTokens (BaseStruct (t:ts) args [] : helems) hs


-- split the HTML token stack up to a particular token:
splitHtmlElems :: String -> [BaseHtml]
               -> ([(String,String)],[BaseHtml],[BaseHtml])
splitHtmlElems _ [] = ([],[],[])
splitHtmlElems tag (BaseText s : hs) =
 let (largs,elems,rest) = splitHtmlElems tag hs
 in (largs, elems ++ [BaseText s], rest)
splitHtmlElems tag (BaseStruct s args cont@(_:_) : hs) =
 let (largs,elems,rest) = splitHtmlElems tag hs
 in (largs, elems ++ [BaseStruct s args cont], rest)
splitHtmlElems tag (BaseStruct s args []: hs) =
 if tag==s
   then (args,[],hs)
   else let (largs,elems,rest) = splitHtmlElems tag hs
        in  (largs, elems ++ [BaseStruct s args []], rest)


-- scan an HTML string into list of HTML tokens:
scanHtmlString :: String -> [HtmlToken]
scanHtmlString s = scanHtml s
 where
  scanHtml []     = []
  scanHtml (c:cs) =
    if c=='<'
    then if take 3 cs == "!--"
         then scanHtmlComment cs
         else if take 4 (map toLower cs) == "pre>"
              then scanHtmlPre "" (skipFirstNewLine (drop 4 cs))
              else scanHtmlElem [] cs
    else let (initxt,remtag) = break (=='<') (c:cs)
          in HText initxt : scanHtml remtag

-- scan an HTML element
scanHtmlElem :: String -> String -> [HtmlToken]
scanHtmlElem ct [] = [HText ("&lt;"++ct)] -- incomplete element
scanHtmlElem ct (c:cs)
  | c=='>'    = (if null ct
                 then HText "&lt;&gt;" -- invalid HTML, but we accept it...
                 else HElem ct [])  : scanHtmlString cs
  | isSpace c =
     if null ct
     then HText "&lt; " : scanHtmlString cs -- invalid HTML, but we accept it...
     else let (args,rest) = splitAtElement (=='>') (dropWhile isSpace cs)
              revargs = reverse args
           in if null args || head revargs /= '/'
              then HElem ct (string2args args) : scanHtmlString rest
              else HElem ct (string2args (reverse (tail revargs)))
                    : HElem ('/':ct) [] : scanHtmlString rest
  | c=='/' && head cs == '>' = HElem ct [] : HElem ('/':ct) []
                                           : scanHtmlString (tail cs)
  | otherwise = scanHtmlElem (ct++[toLower c]) cs

-- scan an HTML comment
scanHtmlComment :: String -> [HtmlToken]
scanHtmlComment [] = []
scanHtmlComment (c:cs) =
  if c=='-' && take 2 cs == "->"
  then scanHtmlString (drop 2 cs)
  else scanHtmlComment cs

-- scan an HTML preformatted element
scanHtmlPre :: String -> String -> [HtmlToken]
scanHtmlPre _ [] = []  -- errorneous incomplete element
scanHtmlPre pre (c:cs) =
  if c=='<' && take 5 (map toLower cs) == "/pre>"
  then HElem "pre" [] : HText (reverse pre) : HElem "/pre" []
       : scanHtmlString (drop 5 cs)
  else scanHtmlPre (c:pre) cs

-- split a string into blank separated list of strings:
string2args :: String -> [(String,String)]
string2args [] = []
string2args (c:cs) =
   let (arg1,rest) = splitAtElement isSpace (c:cs)
   in  deleteApo (splitAtElement (=='=') arg1)
        : string2args (dropWhile isSpace rest)

deleteApo :: (String,String) -> (String,String)
deleteApo (tag,[]) = (map toLower tag,[])
deleteApo (tag,c:cs) | c=='"'    = (map toLower tag, deleteLastApo cs)
                     | c=='\''   = (map toLower tag, deleteLastApo cs)
                     | otherwise = (map toLower tag, c:cs)

deleteLastApo :: String -> String
deleteLastApo [] = []
deleteLastApo [c] = if c=='"' || c=='\'' then [] else [c]
deleteLastApo (c1:c2:cs) = c1 : deleteLastApo (c2:cs)


-- split a list at the first element satisfying a predicate:
splitAtElement :: (a -> Bool) -> [a] -> ([a],[a])
splitAtElement _ [] = ([],[])
splitAtElement p (c:cs) =
  if p c then ([],cs)
         else let (first,rest) = splitAtElement p cs in (c:first,rest)

skipFirstNewLine :: String -> String
skipFirstNewLine [] = []
skipFirstNewLine (c:cs) =
  if c=='\n' then cs
             else if isSpace c then skipFirstNewLine cs else c:cs

-- end of HTML parser