http urls monitor.

scanner.go 10KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452
  1. package scanner
  2. import (
  3. "bytes"
  4. "fmt"
  5. "os"
  6. "unicode"
  7. "unicode/utf8"
  8. "github.com/hashicorp/hcl/json/token"
  9. )
  10. // eof represents a marker rune for the end of the reader.
  11. const eof = rune(0)
  12. // Scanner defines a lexical scanner
  13. type Scanner struct {
  14. buf *bytes.Buffer // Source buffer for advancing and scanning
  15. src []byte // Source buffer for immutable access
  16. // Source Position
  17. srcPos token.Pos // current position
  18. prevPos token.Pos // previous position, used for peek() method
  19. lastCharLen int // length of last character in bytes
  20. lastLineLen int // length of last line in characters (for correct column reporting)
  21. tokStart int // token text start position
  22. tokEnd int // token text end position
  23. // Error is called for each error encountered. If no Error
  24. // function is set, the error is reported to os.Stderr.
  25. Error func(pos token.Pos, msg string)
  26. // ErrorCount is incremented by one for each error encountered.
  27. ErrorCount int
  28. // tokPos is the start position of most recently scanned token; set by
  29. // Scan. The Filename field is always left untouched by the Scanner. If
  30. // an error is reported (via Error) and Position is invalid, the scanner is
  31. // not inside a token.
  32. tokPos token.Pos
  33. }
  34. // New creates and initializes a new instance of Scanner using src as
  35. // its source content.
  36. func New(src []byte) *Scanner {
  37. // even though we accept a src, we read from a io.Reader compatible type
  38. // (*bytes.Buffer). So in the future we might easily change it to streaming
  39. // read.
  40. b := bytes.NewBuffer(src)
  41. s := &Scanner{
  42. buf: b,
  43. src: src,
  44. }
  45. // srcPosition always starts with 1
  46. s.srcPos.Line = 1
  47. return s
  48. }
  49. // next reads the next rune from the bufferred reader. Returns the rune(0) if
  50. // an error occurs (or io.EOF is returned).
  51. func (s *Scanner) next() rune {
  52. ch, size, err := s.buf.ReadRune()
  53. if err != nil {
  54. // advance for error reporting
  55. s.srcPos.Column++
  56. s.srcPos.Offset += size
  57. s.lastCharLen = size
  58. return eof
  59. }
  60. if ch == utf8.RuneError && size == 1 {
  61. s.srcPos.Column++
  62. s.srcPos.Offset += size
  63. s.lastCharLen = size
  64. s.err("illegal UTF-8 encoding")
  65. return ch
  66. }
  67. // remember last position
  68. s.prevPos = s.srcPos
  69. s.srcPos.Column++
  70. s.lastCharLen = size
  71. s.srcPos.Offset += size
  72. if ch == '\n' {
  73. s.srcPos.Line++
  74. s.lastLineLen = s.srcPos.Column
  75. s.srcPos.Column = 0
  76. }
  77. // debug
  78. // fmt.Printf("ch: %q, offset:column: %d:%d\n", ch, s.srcPos.Offset, s.srcPos.Column)
  79. return ch
  80. }
  81. // unread unreads the previous read Rune and updates the source position
  82. func (s *Scanner) unread() {
  83. if err := s.buf.UnreadRune(); err != nil {
  84. panic(err) // this is user fault, we should catch it
  85. }
  86. s.srcPos = s.prevPos // put back last position
  87. }
  88. // peek returns the next rune without advancing the reader.
  89. func (s *Scanner) peek() rune {
  90. peek, _, err := s.buf.ReadRune()
  91. if err != nil {
  92. return eof
  93. }
  94. s.buf.UnreadRune()
  95. return peek
  96. }
  97. // Scan scans the next token and returns the token.
  98. func (s *Scanner) Scan() token.Token {
  99. ch := s.next()
  100. // skip white space
  101. for isWhitespace(ch) {
  102. ch = s.next()
  103. }
  104. var tok token.Type
  105. // token text markings
  106. s.tokStart = s.srcPos.Offset - s.lastCharLen
  107. // token position, initial next() is moving the offset by one(size of rune
  108. // actually), though we are interested with the starting point
  109. s.tokPos.Offset = s.srcPos.Offset - s.lastCharLen
  110. if s.srcPos.Column > 0 {
  111. // common case: last character was not a '\n'
  112. s.tokPos.Line = s.srcPos.Line
  113. s.tokPos.Column = s.srcPos.Column
  114. } else {
  115. // last character was a '\n'
  116. // (we cannot be at the beginning of the source
  117. // since we have called next() at least once)
  118. s.tokPos.Line = s.srcPos.Line - 1
  119. s.tokPos.Column = s.lastLineLen
  120. }
  121. switch {
  122. case isLetter(ch):
  123. lit := s.scanIdentifier()
  124. if lit == "true" || lit == "false" {
  125. tok = token.BOOL
  126. } else if lit == "null" {
  127. tok = token.NULL
  128. } else {
  129. s.err("illegal char")
  130. }
  131. case isDecimal(ch):
  132. tok = s.scanNumber(ch)
  133. default:
  134. switch ch {
  135. case eof:
  136. tok = token.EOF
  137. case '"':
  138. tok = token.STRING
  139. s.scanString()
  140. case '.':
  141. tok = token.PERIOD
  142. ch = s.peek()
  143. if isDecimal(ch) {
  144. tok = token.FLOAT
  145. ch = s.scanMantissa(ch)
  146. ch = s.scanExponent(ch)
  147. }
  148. case '[':
  149. tok = token.LBRACK
  150. case ']':
  151. tok = token.RBRACK
  152. case '{':
  153. tok = token.LBRACE
  154. case '}':
  155. tok = token.RBRACE
  156. case ',':
  157. tok = token.COMMA
  158. case ':':
  159. tok = token.COLON
  160. case '-':
  161. if isDecimal(s.peek()) {
  162. ch := s.next()
  163. tok = s.scanNumber(ch)
  164. } else {
  165. s.err("illegal char")
  166. }
  167. default:
  168. s.err("illegal char: " + string(ch))
  169. }
  170. }
  171. // finish token ending
  172. s.tokEnd = s.srcPos.Offset
  173. // create token literal
  174. var tokenText string
  175. if s.tokStart >= 0 {
  176. tokenText = string(s.src[s.tokStart:s.tokEnd])
  177. }
  178. s.tokStart = s.tokEnd // ensure idempotency of tokenText() call
  179. return token.Token{
  180. Type: tok,
  181. Pos: s.tokPos,
  182. Text: tokenText,
  183. }
  184. }
  185. // scanNumber scans a HCL number definition starting with the given rune
  186. func (s *Scanner) scanNumber(ch rune) token.Type {
  187. zero := ch == '0'
  188. pos := s.srcPos
  189. s.scanMantissa(ch)
  190. ch = s.next() // seek forward
  191. if ch == 'e' || ch == 'E' {
  192. ch = s.scanExponent(ch)
  193. return token.FLOAT
  194. }
  195. if ch == '.' {
  196. ch = s.scanFraction(ch)
  197. if ch == 'e' || ch == 'E' {
  198. ch = s.next()
  199. ch = s.scanExponent(ch)
  200. }
  201. return token.FLOAT
  202. }
  203. if ch != eof {
  204. s.unread()
  205. }
  206. // If we have a larger number and this is zero, error
  207. if zero && pos != s.srcPos {
  208. s.err("numbers cannot start with 0")
  209. }
  210. return token.NUMBER
  211. }
  212. // scanMantissa scans the mantissa beginning from the rune. It returns the next
  213. // non decimal rune. It's used to determine wheter it's a fraction or exponent.
  214. func (s *Scanner) scanMantissa(ch rune) rune {
  215. scanned := false
  216. for isDecimal(ch) {
  217. ch = s.next()
  218. scanned = true
  219. }
  220. if scanned && ch != eof {
  221. s.unread()
  222. }
  223. return ch
  224. }
  225. // scanFraction scans the fraction after the '.' rune
  226. func (s *Scanner) scanFraction(ch rune) rune {
  227. if ch == '.' {
  228. ch = s.peek() // we peek just to see if we can move forward
  229. ch = s.scanMantissa(ch)
  230. }
  231. return ch
  232. }
  233. // scanExponent scans the remaining parts of an exponent after the 'e' or 'E'
  234. // rune.
  235. func (s *Scanner) scanExponent(ch rune) rune {
  236. if ch == 'e' || ch == 'E' {
  237. ch = s.next()
  238. if ch == '-' || ch == '+' {
  239. ch = s.next()
  240. }
  241. ch = s.scanMantissa(ch)
  242. }
  243. return ch
  244. }
  245. // scanString scans a quoted string
  246. func (s *Scanner) scanString() {
  247. braces := 0
  248. for {
  249. // '"' opening already consumed
  250. // read character after quote
  251. ch := s.next()
  252. if ch == '\n' || ch < 0 || ch == eof {
  253. s.err("literal not terminated")
  254. return
  255. }
  256. if ch == '"' {
  257. break
  258. }
  259. // If we're going into a ${} then we can ignore quotes for awhile
  260. if braces == 0 && ch == '$' && s.peek() == '{' {
  261. braces++
  262. s.next()
  263. } else if braces > 0 && ch == '{' {
  264. braces++
  265. }
  266. if braces > 0 && ch == '}' {
  267. braces--
  268. }
  269. if ch == '\\' {
  270. s.scanEscape()
  271. }
  272. }
  273. return
  274. }
  275. // scanEscape scans an escape sequence
  276. func (s *Scanner) scanEscape() rune {
  277. // http://en.cppreference.com/w/cpp/language/escape
  278. ch := s.next() // read character after '/'
  279. switch ch {
  280. case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"':
  281. // nothing to do
  282. case '0', '1', '2', '3', '4', '5', '6', '7':
  283. // octal notation
  284. ch = s.scanDigits(ch, 8, 3)
  285. case 'x':
  286. // hexademical notation
  287. ch = s.scanDigits(s.next(), 16, 2)
  288. case 'u':
  289. // universal character name
  290. ch = s.scanDigits(s.next(), 16, 4)
  291. case 'U':
  292. // universal character name
  293. ch = s.scanDigits(s.next(), 16, 8)
  294. default:
  295. s.err("illegal char escape")
  296. }
  297. return ch
  298. }
  299. // scanDigits scans a rune with the given base for n times. For example an
  300. // octal notation \184 would yield in scanDigits(ch, 8, 3)
  301. func (s *Scanner) scanDigits(ch rune, base, n int) rune {
  302. for n > 0 && digitVal(ch) < base {
  303. ch = s.next()
  304. n--
  305. }
  306. if n > 0 {
  307. s.err("illegal char escape")
  308. }
  309. // we scanned all digits, put the last non digit char back
  310. s.unread()
  311. return ch
  312. }
  313. // scanIdentifier scans an identifier and returns the literal string
  314. func (s *Scanner) scanIdentifier() string {
  315. offs := s.srcPos.Offset - s.lastCharLen
  316. ch := s.next()
  317. for isLetter(ch) || isDigit(ch) || ch == '-' {
  318. ch = s.next()
  319. }
  320. if ch != eof {
  321. s.unread() // we got identifier, put back latest char
  322. }
  323. return string(s.src[offs:s.srcPos.Offset])
  324. }
  325. // recentPosition returns the position of the character immediately after the
  326. // character or token returned by the last call to Scan.
  327. func (s *Scanner) recentPosition() (pos token.Pos) {
  328. pos.Offset = s.srcPos.Offset - s.lastCharLen
  329. switch {
  330. case s.srcPos.Column > 0:
  331. // common case: last character was not a '\n'
  332. pos.Line = s.srcPos.Line
  333. pos.Column = s.srcPos.Column
  334. case s.lastLineLen > 0:
  335. // last character was a '\n'
  336. // (we cannot be at the beginning of the source
  337. // since we have called next() at least once)
  338. pos.Line = s.srcPos.Line - 1
  339. pos.Column = s.lastLineLen
  340. default:
  341. // at the beginning of the source
  342. pos.Line = 1
  343. pos.Column = 1
  344. }
  345. return
  346. }
  347. // err prints the error of any scanning to s.Error function. If the function is
  348. // not defined, by default it prints them to os.Stderr
  349. func (s *Scanner) err(msg string) {
  350. s.ErrorCount++
  351. pos := s.recentPosition()
  352. if s.Error != nil {
  353. s.Error(pos, msg)
  354. return
  355. }
  356. fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg)
  357. }
  358. // isHexadecimal returns true if the given rune is a letter
  359. func isLetter(ch rune) bool {
  360. return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 0x80 && unicode.IsLetter(ch)
  361. }
  362. // isHexadecimal returns true if the given rune is a decimal digit
  363. func isDigit(ch rune) bool {
  364. return '0' <= ch && ch <= '9' || ch >= 0x80 && unicode.IsDigit(ch)
  365. }
  366. // isHexadecimal returns true if the given rune is a decimal number
  367. func isDecimal(ch rune) bool {
  368. return '0' <= ch && ch <= '9'
  369. }
  370. // isHexadecimal returns true if the given rune is an hexadecimal number
  371. func isHexadecimal(ch rune) bool {
  372. return '0' <= ch && ch <= '9' || 'a' <= ch && ch <= 'f' || 'A' <= ch && ch <= 'F'
  373. }
  374. // isWhitespace returns true if the rune is a space, tab, newline or carriage return
  375. func isWhitespace(ch rune) bool {
  376. return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r'
  377. }
  378. // digitVal returns the integer value of a given octal,decimal or hexadecimal rune
  379. func digitVal(ch rune) int {
  380. switch {
  381. case '0' <= ch && ch <= '9':
  382. return int(ch - '0')
  383. case 'a' <= ch && ch <= 'f':
  384. return int(ch - 'a' + 10)
  385. case 'A' <= ch && ch <= 'F':
  386. return int(ch - 'A' + 10)
  387. }
  388. return 16 // larger than any legal digit val
  389. }