1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
------------------------------------------------------------------------------
--- This module contains a very simple parser for HTML documents.
---
--- @author Michael Hanus
--- @version October 2022
------------------------------------------------------------------------------

module HTML.Parser ( readHtmlFile, parseHtmlString )
 where

import Data.Char
import HTML.Base

------------------------------------------------------------------------------
--- Reads a file with HTML text and returns the corresponding HTML expressions.
--- @param file - the name of a file containing HTML text
--- @return a list of HTML expressions (if the file contains exactly one
---         HTML document, this list should contain exactly one element)
readHtmlFile :: HTML h => String -> IO [h]
readHtmlFile file = readFile file >>= return . parseHtmlString

--- Transforms an HTML string into a list of `BaseHTML` expressions.
--- If the HTML string is a well structured document, the list
--- of HTML expressions should contain exactly one element.
parseHtmlString :: HTML h => String -> [h]
parseHtmlString = map fromStaticHtml . parseHtml

--- Transforms an HTML string into a list of `StaticHtml` expressions.
--- If the HTML string is a well structured document, the list
--- of HTML expressions should contain exactly one element.
parseHtml :: String -> [StaticHtml]
parseHtml s = reverse (parseHtmlTokens [] (scanHtmlString s))

------------------------------------------------------------------------------

--- The data type for representing HTML tokens.
data HtmlToken = HTText String | HTElem String [(String,String)]

-- parse a list of HTML tokens into list of HTML expressions:
-- (first argument "helems" is a stack of already read tokens)
parseHtmlTokens :: [StaticHtml] -> [HtmlToken] -> [StaticHtml]
parseHtmlTokens helems []              = helems
parseHtmlTokens helems (HTText s : hs) =
 parseHtmlTokens (HText s : helems) hs
parseHtmlTokens helems (HTElem (t:ts) args : hs) =
  if t == '/'
    then let (structargs,elems,rest) = splitHtmlElems ts helems
         in parseHtmlTokens ([HStruct ts structargs elems] ++ rest) hs
    else parseHtmlTokens (HStruct (t:ts) args [] : helems) hs
parseHtmlTokens _ (HTElem [] _ : _) =
  error "Internal error in HTML.Parser.parseHtmlTokens: empty list in HTElem"


-- split the HTML token stack up to a particular token:
splitHtmlElems :: String -> [StaticHtml]
               -> ([(String,String)],[StaticHtml],[StaticHtml])
splitHtmlElems _ [] = ([],[],[])
splitHtmlElems tag (HText s : hs) =
  let (largs,elems,rest) = splitHtmlElems tag hs
  in (largs, elems ++ [HText s], rest)
splitHtmlElems tag (HStruct s args cont@(_:_) : hs) =
  let (largs,elems,rest) = splitHtmlElems tag hs
  in (largs, elems ++ [HStruct s args cont], rest)
splitHtmlElems tag (HStruct s args []: hs) =
  if tag==s
    then (args,[],hs)
    else let (largs,elems,rest) = splitHtmlElems tag hs
         in  (largs, elems ++ [HStruct s args []], rest)


-- scan an HTML string into list of HTML tokens:
scanHtmlString :: String -> [HtmlToken]
scanHtmlString s = scanHtml s
 where
  scanHtml []     = []
  scanHtml (c:cs) =
    if c=='<'
    then if take 3 cs == "!--"
         then scanHtmlComment cs
         else if take 4 (map toLower cs) == "pre>"
              then scanHtmlPre "" (skipFirstNewLine (drop 4 cs))
              else scanHtmlElem [] cs
    else let (initxt,remtag) = break (=='<') (c:cs)
          in HTText initxt : scanHtml remtag

-- scan an HTML element
scanHtmlElem :: String -> String -> [HtmlToken]
scanHtmlElem ct [] = [HTText ("&lt;"++ct)] -- incomplete element
scanHtmlElem ct (c:cs)
  | c=='>'    = (if null ct
                 then HTText "&lt;&gt;" -- invalid HTML, but we accept it...
                 else HTElem ct [])  : scanHtmlString cs
  | isSpace c =
     if null ct
     then HTText "&lt; " : scanHtmlString cs -- invalid HTML, but we accept it...
     else let (args,rest) = splitAtElement (=='>') (dropWhile isSpace cs)
              revargs = reverse args
           in if null args || head revargs /= '/'
              then HTElem ct (string2args args) : scanHtmlString rest
              else HTElem ct (string2args (reverse (tail revargs)))
                    : HTElem ('/':ct) [] : scanHtmlString rest
  | c=='/' && head cs == '>' = HTElem ct [] : HTElem ('/':ct) []
                                           : scanHtmlString (tail cs)
  | otherwise = scanHtmlElem (ct++[toLower c]) cs

-- scan an HTML comment
scanHtmlComment :: String -> [HtmlToken]
scanHtmlComment [] = []
scanHtmlComment (c:cs) =
  if c=='-' && take 2 cs == "->"
  then scanHtmlString (drop 2 cs)
  else scanHtmlComment cs

-- scan an HTML preformatted element
scanHtmlPre :: String -> String -> [HtmlToken]
scanHtmlPre _ [] = []  -- errorneous incomplete element
scanHtmlPre pre (c:cs) =
  if c=='<' && take 5 (map toLower cs) == "/pre>"
  then HTElem "pre" [] : HTText (reverse pre) : HTElem "/pre" []
       : scanHtmlString (drop 5 cs)
  else scanHtmlPre (c:pre) cs

-- split a string into blank separated list of strings:
string2args :: String -> [(String,String)]
string2args [] = []
string2args (c:cs) =
   let (arg1,rest) = splitAtElement isSpace (c:cs)
   in  deleteApo (splitAtElement (=='=') arg1)
        : string2args (dropWhile isSpace rest)

deleteApo :: (String,String) -> (String,String)
deleteApo (tag,[]) = (map toLower tag,[])
deleteApo (tag,c:cs) | c=='"'    = (map toLower tag, deleteLastApo cs)
                     | c=='\''   = (map toLower tag, deleteLastApo cs)
                     | otherwise = (map toLower tag, c:cs)

deleteLastApo :: String -> String
deleteLastApo [] = []
deleteLastApo [c] = if c=='"' || c=='\'' then [] else [c]
deleteLastApo (c1:c2:cs) = c1 : deleteLastApo (c2:cs)


-- split a list at the first element satisfying a predicate:
splitAtElement :: (a -> Bool) -> [a] -> ([a],[a])
splitAtElement _ [] = ([],[])
splitAtElement p (c:cs) =
  if p c then ([],cs)
         else let (first,rest) = splitAtElement p cs in (c:first,rest)

skipFirstNewLine :: String -> String
skipFirstNewLine [] = []
skipFirstNewLine (c:cs) =
  if c=='\n' then cs
             else if isSpace c then skipFirstNewLine cs else c:cs

-- end of HTML parser