nex.go 27 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237
  1. // Substantial copy-and-paste from src/pkg/regexp.
  2. package main
  3. import (
  4. "bufio"
  5. "errors"
  6. "fmt"
  7. "io"
  8. "io/ioutil"
  9. "log"
  10. "os"
  11. "sort"
  12. "strconv"
  13. "strings"
  14. )
  15. import (
  16. "go/format"
  17. "go/parser"
  18. "go/printer"
  19. "go/token"
  20. )
  21. type rule struct {
  22. regex []rune
  23. code string
  24. startCode string
  25. endCode string
  26. kid []*rule
  27. id string
  28. }
  29. var (
  30. ErrInternal = errors.New("internal error")
  31. ErrUnmatchedLpar = errors.New("unmatched '('")
  32. ErrUnmatchedRpar = errors.New("unmatched ')'")
  33. ErrUnmatchedLbkt = errors.New("unmatched '['")
  34. ErrUnmatchedRbkt = errors.New("unmatched ']'")
  35. ErrBadRange = errors.New("bad range in character class")
  36. ErrExtraneousBackslash = errors.New("extraneous backslash")
  37. ErrBareClosure = errors.New("closure applies to nothing")
  38. ErrBadBackslash = errors.New("illegal backslash escape")
  39. ErrExpectedLBrace = errors.New("expected '{'")
  40. ErrUnmatchedLBrace = errors.New("unmatched '{'")
  41. ErrUnexpectedEOF = errors.New("unexpected EOF")
  42. ErrUnexpectedNewline = errors.New("unexpected newline")
  43. ErrUnexpectedLAngle = errors.New("unexpected '<'")
  44. ErrUnmatchedLAngle = errors.New("unmatched '<'")
  45. ErrUnmatchedRAngle = errors.New("unmatched '>'")
  46. )
  47. func ispunct(c rune) bool {
  48. for _, r := range "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" {
  49. if c == r {
  50. return true
  51. }
  52. }
  53. return false
  54. }
  55. var escapes = []rune("abfnrtv")
  56. var escaped = []rune("\a\b\f\n\r\t\v")
  57. func escape(c rune) rune {
  58. for i, b := range escapes {
  59. if b == c {
  60. return escaped[i]
  61. }
  62. }
  63. return -1
  64. }
  65. const (
  66. kNil = iota
  67. kRune
  68. kClass
  69. kWild
  70. kStart
  71. kEnd
  72. )
  73. type edge struct {
  74. kind int // Rune/Class/Wild/Nil.
  75. r rune // Rune for rune edges.
  76. lim []rune // Pairs of limits for character class edges.
  77. negate bool // True if the character class is negated.
  78. dst *node // Destination node.
  79. }
  80. type node struct {
  81. e edges // Outedges.
  82. n int // Index number. Scoped to a family.
  83. accept bool // True if this is an accepting state.
  84. set []int // The NFA nodes represented by a DFA node.
  85. }
  86. type edges []*edge
  87. func (e edges) Len() int {
  88. return len(e)
  89. }
  90. func (e edges) Less(i, j int) bool {
  91. return e[i].r < e[j].r
  92. }
  93. func (e edges) Swap(i, j int) {
  94. e[i], e[j] = e[j], e[i]
  95. }
  96. type RuneSlice []rune
  97. func (p RuneSlice) Len() int { return len(p) }
  98. func (p RuneSlice) Less(i, j int) bool { return p[i] < p[j] }
  99. func (p RuneSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
  100. // Print a graph in DOT format given the start node.
  101. //
  102. // $ dot -Tps input.dot -o output.ps
  103. func writeDotGraph(outf *os.File, start *node, id string) {
  104. done := make(map[*node]bool)
  105. var show func(*node)
  106. show = func(u *node) {
  107. if u.accept {
  108. fmt.Fprintf(outf, " %v[style=filled,color=green];\n", u.n)
  109. }
  110. done[u] = true
  111. for _, e := range u.e {
  112. // We use -1 to denote the dead end node in DFAs.
  113. if e.dst.n == -1 {
  114. continue
  115. }
  116. label := ""
  117. runeToDot := func(r rune) string {
  118. if strconv.IsPrint(r) {
  119. return fmt.Sprintf("%v", string(r))
  120. }
  121. return fmt.Sprintf("U+%X", int(r))
  122. }
  123. switch e.kind {
  124. case kRune:
  125. label = fmt.Sprintf("[label=%q]", runeToDot(e.r))
  126. case kWild:
  127. label = "[color=blue]"
  128. case kClass:
  129. label = "[label=\"["
  130. if e.negate {
  131. label += "^"
  132. }
  133. for i := 0; i < len(e.lim); i += 2 {
  134. label += runeToDot(e.lim[i])
  135. if e.lim[i] != e.lim[i+1] {
  136. label += "-" + runeToDot(e.lim[i+1])
  137. }
  138. }
  139. label += "]\"]"
  140. }
  141. fmt.Fprintf(outf, " %v -> %v%v;\n", u.n, e.dst.n, label)
  142. }
  143. for _, e := range u.e {
  144. if !done[e.dst] {
  145. show(e.dst)
  146. }
  147. }
  148. }
  149. fmt.Fprintf(outf, "digraph %v {\n 0[shape=box];\n", id)
  150. show(start)
  151. fmt.Fprintln(outf, "}")
  152. }
  153. func inClass(r rune, lim []rune) bool {
  154. for i := 0; i < len(lim); i += 2 {
  155. if lim[i] <= r && r <= lim[i+1] {
  156. return true
  157. }
  158. }
  159. return false
  160. }
  161. var dfadot, nfadot *os.File
  162. func gen(out *bufio.Writer, x *rule) {
  163. s := x.regex
  164. // Regex -> NFA
  165. // We cannot have our alphabet be all Unicode characters. Instead,
  166. // we compute an alphabet for each regex:
  167. //
  168. // 1. Singles: we add single runes used in the regex: any rune not in a
  169. // range. These are held in `sing`.
  170. //
  171. // 2. Ranges: entire ranges become elements of the alphabet. If ranges in
  172. // the same expression overlap, we break them up into non-overlapping
  173. // ranges. The generated code checks singles before ranges, so there's no
  174. // need to break up a range if it contains a single. These are maintained
  175. // in sorted order in `lim`.
  176. //
  177. // 3. Wild: we add an element representing all other runes.
  178. //
  179. // e.g. the alphabet of /[0-9]*[Ee][2-5]*/ is sing: { E, e },
  180. // lim: { [0-1], [2-5], [6-9] } and the wild element.
  181. sing := make(map[rune]bool)
  182. var lim []rune
  183. var insertLimits func(l, r rune)
  184. // Insert a new range [l-r] into `lim`, breaking it up if it overlaps, and
  185. // discarding it if it coincides with an existing range. We keep `lim`
  186. // sorted.
  187. insertLimits = func(l, r rune) {
  188. var i int
  189. for i = 0; i < len(lim); i += 2 {
  190. if l <= lim[i+1] {
  191. break
  192. }
  193. }
  194. if len(lim) == i || r < lim[i] {
  195. lim = append(lim, 0, 0)
  196. copy(lim[i+2:], lim[i:])
  197. lim[i] = l
  198. lim[i+1] = r
  199. return
  200. }
  201. if l < lim[i] {
  202. lim = append(lim, 0, 0)
  203. copy(lim[i+2:], lim[i:])
  204. lim[i+1] = lim[i] - 1
  205. lim[i] = l
  206. insertLimits(lim[i], r)
  207. return
  208. }
  209. if l > lim[i] {
  210. lim = append(lim, 0, 0)
  211. copy(lim[i+2:], lim[i:])
  212. lim[i+1] = l - 1
  213. lim[i+2] = l
  214. insertLimits(l, r)
  215. return
  216. }
  217. // l == lim[i]
  218. if r == lim[i+1] {
  219. return
  220. }
  221. if r < lim[i+1] {
  222. lim = append(lim, 0, 0)
  223. copy(lim[i+2:], lim[i:])
  224. lim[i] = l
  225. lim[i+1] = r
  226. lim[i+2] = r + 1
  227. return
  228. }
  229. insertLimits(lim[i+1]+1, r)
  230. }
  231. pos := 0
  232. n := 0
  233. newNode := func() *node {
  234. res := new(node)
  235. res.n = n
  236. n++
  237. return res
  238. }
  239. newEdge := func(u, v *node) *edge {
  240. res := new(edge)
  241. res.dst = v
  242. u.e = append(u.e, res)
  243. sort.Sort(u.e)
  244. return res
  245. }
  246. newStartEdge := func(u, v *node) *edge {
  247. res := newEdge(u, v)
  248. res.kind = kStart
  249. return res
  250. }
  251. newEndEdge := func(u, v *node) *edge {
  252. res := newEdge(u, v)
  253. res.kind = kEnd
  254. return res
  255. }
  256. newWildEdge := func(u, v *node) *edge {
  257. res := newEdge(u, v)
  258. res.kind = kWild
  259. return res
  260. }
  261. newRuneEdge := func(u, v *node, r rune) *edge {
  262. res := newEdge(u, v)
  263. res.kind = kRune
  264. res.r = r
  265. sing[r] = true
  266. return res
  267. }
  268. newNilEdge := func(u, v *node) *edge {
  269. res := newEdge(u, v)
  270. res.kind = kNil
  271. return res
  272. }
  273. newClassEdge := func(u, v *node) *edge {
  274. res := newEdge(u, v)
  275. res.kind = kClass
  276. res.lim = make([]rune, 0, 2)
  277. return res
  278. }
  279. maybeEscape := func() rune {
  280. c := s[pos]
  281. if '\\' == c {
  282. pos++
  283. if len(s) == pos {
  284. panic(ErrExtraneousBackslash)
  285. }
  286. c = s[pos]
  287. switch {
  288. case ispunct(c):
  289. case escape(c) >= 0:
  290. c = escape(s[pos])
  291. default:
  292. panic(ErrBadBackslash)
  293. }
  294. }
  295. return c
  296. }
  297. pcharclass := func() (start, end *node) {
  298. start, end = newNode(), newNode()
  299. e := newClassEdge(start, end)
  300. // Ranges consisting of a single element are a special case:
  301. singletonRange := func(c rune) {
  302. // 1. The edge-specific 'lim' field always expects endpoints in pairs,
  303. // so we must give 'c' as the beginning and the end of the range.
  304. e.lim = append(e.lim, c, c)
  305. // 2. Instead of updating the regex-wide 'lim' interval set, we add a singleton.
  306. sing[c] = true
  307. }
  308. if len(s) > pos && '^' == s[pos] {
  309. e.negate = true
  310. pos++
  311. }
  312. var left rune
  313. leftLive := false
  314. justSawDash := false
  315. first := true
  316. // Allow '-' at the beginning and end, and in ranges.
  317. for pos < len(s) && s[pos] != ']' {
  318. switch c := maybeEscape(); c {
  319. case '-':
  320. if first {
  321. singletonRange('-')
  322. break
  323. }
  324. justSawDash = true
  325. default:
  326. if justSawDash {
  327. if !leftLive || left > c {
  328. panic(ErrBadRange)
  329. }
  330. e.lim = append(e.lim, left, c)
  331. if left == c {
  332. sing[c] = true
  333. } else {
  334. insertLimits(left, c)
  335. }
  336. leftLive = false
  337. } else {
  338. if leftLive {
  339. singletonRange(left)
  340. }
  341. left = c
  342. leftLive = true
  343. }
  344. justSawDash = false
  345. }
  346. first = false
  347. pos++
  348. }
  349. if leftLive {
  350. singletonRange(left)
  351. }
  352. if justSawDash {
  353. singletonRange('-')
  354. }
  355. return
  356. }
  357. isNested := false
  358. var pre func() (start, end *node)
  359. pterm := func() (start, end *node) {
  360. if len(s) == pos || s[pos] == '|' {
  361. end = newNode()
  362. start = end
  363. return
  364. }
  365. switch s[pos] {
  366. case '*', '+', '?':
  367. panic(ErrBareClosure)
  368. case ')':
  369. if !isNested {
  370. panic(ErrUnmatchedRpar)
  371. }
  372. end = newNode()
  373. start = end
  374. return
  375. case '(':
  376. pos++
  377. oldIsNested := isNested
  378. isNested = true
  379. start, end = pre()
  380. isNested = oldIsNested
  381. if len(s) == pos || ')' != s[pos] {
  382. panic(ErrUnmatchedLpar)
  383. }
  384. case '.':
  385. start, end = newNode(), newNode()
  386. newWildEdge(start, end)
  387. case '^':
  388. start, end = newNode(), newNode()
  389. newStartEdge(start, end)
  390. case '$':
  391. start, end = newNode(), newNode()
  392. newEndEdge(start, end)
  393. case ']':
  394. panic(ErrUnmatchedRbkt)
  395. case '[':
  396. pos++
  397. start, end = pcharclass()
  398. if len(s) == pos || ']' != s[pos] {
  399. panic(ErrUnmatchedLbkt)
  400. }
  401. default:
  402. start, end = newNode(), newNode()
  403. newRuneEdge(start, end, maybeEscape())
  404. }
  405. pos++
  406. return
  407. }
  408. pclosure := func() (start, end *node) {
  409. start, end = pterm()
  410. if start == end {
  411. return
  412. }
  413. if len(s) == pos {
  414. return
  415. }
  416. switch s[pos] {
  417. case '*':
  418. newNilEdge(end, start)
  419. nend := newNode()
  420. newNilEdge(end, nend)
  421. start, end = end, nend
  422. case '+':
  423. newNilEdge(end, start)
  424. nend := newNode()
  425. newNilEdge(end, nend)
  426. end = nend
  427. case '?':
  428. newNilEdge(start, end)
  429. default:
  430. return
  431. }
  432. pos++
  433. return
  434. }
  435. pcat := func() (start, end *node) {
  436. for {
  437. nstart, nend := pclosure()
  438. if start == nil {
  439. start, end = nstart, nend
  440. } else if nstart != nend {
  441. end.e = make([]*edge, len(nstart.e))
  442. copy(end.e, nstart.e)
  443. end = nend
  444. }
  445. if nstart == nend {
  446. return
  447. }
  448. }
  449. panic("unreachable")
  450. }
  451. pre = func() (start, end *node) {
  452. start, end = pcat()
  453. for pos < len(s) && s[pos] != ')' {
  454. if s[pos] != '|' {
  455. panic(ErrInternal)
  456. }
  457. pos++
  458. nstart, nend := pcat()
  459. tmp := newNode()
  460. newNilEdge(tmp, start)
  461. newNilEdge(tmp, nstart)
  462. start = tmp
  463. tmp = newNode()
  464. newNilEdge(end, tmp)
  465. newNilEdge(nend, tmp)
  466. end = tmp
  467. }
  468. return
  469. }
  470. start, end := pre()
  471. end.accept = true
  472. // Compute shortlist of nodes (reachable nodes), as we may have discarded
  473. // nodes left over from parsing. Also, make short[0] the start node.
  474. short := make([]*node, 0, n)
  475. {
  476. var visit func(*node)
  477. mark := make([]bool, n)
  478. newn := make([]int, n)
  479. visit = func(u *node) {
  480. mark[u.n] = true
  481. newn[u.n] = len(short)
  482. short = append(short, u)
  483. for _, e := range u.e {
  484. if !mark[e.dst.n] {
  485. visit(e.dst)
  486. }
  487. }
  488. }
  489. visit(start)
  490. for _, v := range short {
  491. v.n = newn[v.n]
  492. }
  493. }
  494. n = len(short)
  495. if nfadot != nil {
  496. writeDotGraph(nfadot, start, "NFA_"+x.id)
  497. }
  498. // NFA -> DFA
  499. nilClose := func(st []bool) {
  500. mark := make([]bool, n)
  501. var do func(int)
  502. do = func(i int) {
  503. v := short[i]
  504. for _, e := range v.e {
  505. if e.kind == kNil && !mark[e.dst.n] {
  506. st[e.dst.n] = true
  507. do(e.dst.n)
  508. }
  509. }
  510. }
  511. for i := 0; i < n; i++ {
  512. if st[i] && !mark[i] {
  513. mark[i] = true
  514. do(i)
  515. }
  516. }
  517. }
  518. var todo []*node
  519. tab := make(map[string]*node)
  520. var buf []byte
  521. dfacount := 0
  522. { // Construct the node of no return.
  523. for i := 0; i < n; i++ {
  524. buf = append(buf, '0')
  525. }
  526. tmp := new(node)
  527. tmp.n = -1
  528. tab[string(buf)] = tmp
  529. }
  530. newDFANode := func(st []bool) (res *node, found bool) {
  531. buf = nil
  532. accept := false
  533. for i, v := range st {
  534. if v {
  535. buf = append(buf, '1')
  536. accept = accept || short[i].accept
  537. } else {
  538. buf = append(buf, '0')
  539. }
  540. }
  541. res, found = tab[string(buf)]
  542. if !found {
  543. res = new(node)
  544. res.n = dfacount
  545. res.accept = accept
  546. dfacount++
  547. for i, v := range st {
  548. if v {
  549. res.set = append(res.set, i)
  550. }
  551. }
  552. tab[string(buf)] = res
  553. }
  554. return res, found
  555. }
  556. get := func(states []bool) *node {
  557. nilClose(states)
  558. node, old := newDFANode(states)
  559. if !old {
  560. todo = append(todo, node)
  561. }
  562. return node
  563. }
  564. getcb := func(v *node, cb func(*edge) bool) *node {
  565. states := make([]bool, n)
  566. for _, i := range v.set {
  567. for _, e := range short[i].e {
  568. if cb(e) {
  569. states[e.dst.n] = true
  570. }
  571. }
  572. }
  573. return get(states)
  574. }
  575. states := make([]bool, n)
  576. // The DFA start state is the state representing the nil-closure of the start
  577. // node in the NFA. Recall it has index 0.
  578. states[0] = true
  579. dfastart := get(states)
  580. for len(todo) > 0 {
  581. v := todo[len(todo)-1]
  582. todo = todo[0 : len(todo)-1]
  583. // Singles.
  584. var runes []rune
  585. for r, _ := range sing {
  586. runes = append(runes, r)
  587. }
  588. sort.Sort(RuneSlice(runes))
  589. for _, r := range runes {
  590. newRuneEdge(v, getcb(v, func(e *edge) bool {
  591. return e.kind == kRune && e.r == r ||
  592. e.kind == kWild ||
  593. e.kind == kClass && e.negate != inClass(r, e.lim)
  594. }), r)
  595. }
  596. // Character ranges.
  597. for j := 0; j < len(lim); j += 2 {
  598. e := newClassEdge(v, getcb(v, func(e *edge) bool {
  599. return e.kind == kWild ||
  600. e.kind == kClass && e.negate != inClass(lim[j], e.lim)
  601. }))
  602. e.lim = append(e.lim, lim[j], lim[j+1])
  603. }
  604. // Wild.
  605. newWildEdge(v, getcb(v, func(e *edge) bool {
  606. return e.kind == kWild || (e.kind == kClass && e.negate)
  607. }))
  608. // ^ and $.
  609. newStartEdge(v, getcb(v, func(e *edge) bool { return e.kind == kStart }))
  610. newEndEdge(v, getcb(v, func(e *edge) bool { return e.kind == kEnd }))
  611. }
  612. n = dfacount
  613. if dfadot != nil {
  614. writeDotGraph(dfadot, dfastart, "DFA_"+x.id)
  615. }
  616. // DFA -> Go
  617. sorted := make([]*node, n)
  618. for _, v := range tab {
  619. if -1 != v.n {
  620. sorted[v.n] = v
  621. }
  622. }
  623. fmt.Fprintf(out, "\n// %v\n", string(x.regex))
  624. for i, v := range sorted {
  625. if i == 0 {
  626. out.WriteString("{[]bool{")
  627. } else {
  628. out.WriteString(", ")
  629. }
  630. if v.accept {
  631. out.WriteString("true")
  632. } else {
  633. out.WriteString("false")
  634. }
  635. }
  636. out.WriteString("}, []func(rune) int{ // Transitions\n")
  637. for _, v := range sorted {
  638. out.WriteString("func(r rune) int {\n")
  639. var runeCases, classCases string
  640. var wildDest int
  641. for _, e := range v.e {
  642. m := e.dst.n
  643. switch e.kind {
  644. case kRune:
  645. runeCases += fmt.Sprintf("\t\tcase %d: return %d\n", e.r, m)
  646. case kClass:
  647. classCases += fmt.Sprintf("\t\tcase %d <= r && r <= %d: return %d\n",
  648. e.lim[0], e.lim[1], m)
  649. case kWild:
  650. wildDest = m
  651. }
  652. }
  653. if runeCases != "" {
  654. out.WriteString("\tswitch(r) {\n" + runeCases + "\t}\n")
  655. }
  656. if classCases != "" {
  657. out.WriteString("\tswitch {\n" + classCases + "\t}\n")
  658. }
  659. fmt.Fprintf(out, "\treturn %v\n},\n", wildDest)
  660. }
  661. out.WriteString("}, []int{ /* Start-of-input transitions */ ")
  662. for _, v := range sorted {
  663. s := " -1,"
  664. for _, e := range v.e {
  665. if e.kind == kStart {
  666. s = fmt.Sprintf(" %d,", e.dst.n)
  667. break
  668. }
  669. }
  670. out.WriteString(s)
  671. }
  672. out.WriteString("}, []int{ /* End-of-input transitions */ ")
  673. for _, v := range sorted {
  674. s := " -1,"
  675. for _, e := range v.e {
  676. if e.kind == kEnd {
  677. s = fmt.Sprintf(" %d,", e.dst.n)
  678. break
  679. }
  680. }
  681. out.WriteString(s)
  682. }
  683. out.WriteString("},")
  684. if len(x.kid) == 0 {
  685. out.WriteString("nil")
  686. } else {
  687. out.WriteString("[]dfa{")
  688. for _, kid := range x.kid {
  689. gen(out, kid)
  690. }
  691. out.WriteString("}")
  692. }
  693. out.WriteString("},\n")
  694. }
  695. func writeFamily(out *bufio.Writer, node *rule, lvl int) {
  696. tab := func() {
  697. for i := 0; i <= lvl; i++ {
  698. out.WriteByte('\t')
  699. }
  700. }
  701. if node.startCode != "" {
  702. tab()
  703. prefixReplacer.WriteString(out, "if !yylex.stale {\n")
  704. tab()
  705. out.WriteString("\t" + node.startCode + "\n")
  706. tab()
  707. out.WriteString("}\n")
  708. }
  709. tab()
  710. fmt.Fprintf(out, "OUTER%s%d:\n", node.id, lvl)
  711. tab()
  712. prefixReplacer.WriteString(out,
  713. fmt.Sprintf("for { switch yylex.next(%v) {\n", lvl))
  714. for i, x := range node.kid {
  715. tab()
  716. fmt.Fprintf(out, "\tcase %d:\n", i)
  717. lvl++
  718. if x.kid != nil {
  719. writeFamily(out, x, lvl)
  720. } else {
  721. tab()
  722. out.WriteString("\t" + x.code + "\n")
  723. }
  724. lvl--
  725. }
  726. tab()
  727. out.WriteString("\tdefault:\n")
  728. tab()
  729. fmt.Fprintf(out, "\t\t break OUTER%s%d\n", node.id, lvl)
  730. tab()
  731. out.WriteString("\t}\n")
  732. tab()
  733. out.WriteString("\tcontinue\n")
  734. tab()
  735. out.WriteString("}\n")
  736. tab()
  737. prefixReplacer.WriteString(out, "yylex.pop()\n")
  738. tab()
  739. out.WriteString(node.endCode + "\n")
  740. }
  741. var lexertext = `import ("bufio";"io";"strings")
  742. type frame struct {
  743. i int
  744. s string
  745. line, column int
  746. }
  747. type Lexer struct {
  748. // The lexer runs in its own goroutine, and communicates via channel 'ch'.
  749. ch chan frame
  750. ch_stop chan bool
  751. // We record the level of nesting because the action could return, and a
  752. // subsequent call expects to pick up where it left off. In other words,
  753. // we're simulating a coroutine.
  754. // TODO: Support a channel-based variant that compatible with Go's yacc.
  755. stack []frame
  756. stale bool
  757. // The 'l' and 'c' fields were added for
  758. // https://github.com/wagerlabs/docker/blob/65694e801a7b80930961d70c69cba9f2465459be/buildfile.nex
  759. // Since then, I introduced the built-in Line() and Column() functions.
  760. l, c int
  761. parseResult interface{}
  762. // The following line makes it easy for scripts to insert fields in the
  763. // generated code.
  764. // [NEX_END_OF_LEXER_STRUCT]
  765. }
  766. // NewLexerWithInit creates a new Lexer object, runs the given callback on it,
  767. // then returns it.
  768. func NewLexerWithInit(in io.Reader, initFun func(*Lexer)) *Lexer {
  769. yylex := new(Lexer)
  770. if initFun != nil {
  771. initFun(yylex)
  772. }
  773. yylex.ch = make(chan frame)
  774. yylex.ch_stop = make(chan bool, 1)
  775. var scan func(in *bufio.Reader, ch chan frame, ch_stop chan bool, family []dfa, line, column int)
  776. scan = func(in *bufio.Reader, ch chan frame, ch_stop chan bool, family []dfa, line, column int) {
  777. // Index of DFA and length of highest-precedence match so far.
  778. matchi, matchn := 0, -1
  779. var buf []rune
  780. n := 0
  781. checkAccept := func(i int, st int) bool {
  782. // Higher precedence match? DFAs are run in parallel, so matchn is at most len(buf), hence we may omit the length equality check.
  783. if family[i].acc[st] && (matchn < n || matchi > i) {
  784. matchi, matchn = i, n
  785. return true
  786. }
  787. return false
  788. }
  789. var state [][2]int
  790. for i := 0; i < len(family); i++ {
  791. mark := make([]bool, len(family[i].startf))
  792. // Every DFA starts at state 0.
  793. st := 0
  794. for {
  795. state = append(state, [2]int{i, st})
  796. mark[st] = true
  797. // As we're at the start of input, follow all ^ transitions and append to our list of start states.
  798. st = family[i].startf[st]
  799. if -1 == st || mark[st] { break }
  800. // We only check for a match after at least one transition.
  801. checkAccept(i, st)
  802. }
  803. }
  804. atEOF := false
  805. stopped := false
  806. for {
  807. if n == len(buf) && !atEOF {
  808. r,_,err := in.ReadRune()
  809. switch err {
  810. case io.EOF: atEOF = true
  811. case nil: buf = append(buf, r)
  812. default: panic(err)
  813. }
  814. }
  815. if !atEOF {
  816. r := buf[n]
  817. n++
  818. var nextState [][2]int
  819. for _, x := range state {
  820. x[1] = family[x[0]].f[x[1]](r)
  821. if -1 == x[1] { continue }
  822. nextState = append(nextState, x)
  823. checkAccept(x[0], x[1])
  824. }
  825. state = nextState
  826. } else {
  827. dollar: // Handle $.
  828. for _, x := range state {
  829. mark := make([]bool, len(family[x[0]].endf))
  830. for {
  831. mark[x[1]] = true
  832. x[1] = family[x[0]].endf[x[1]]
  833. if -1 == x[1] || mark[x[1]] { break }
  834. if checkAccept(x[0], x[1]) {
  835. // Unlike before, we can break off the search. Now that we're at the end, there's no need to maintain the state of each DFA.
  836. break dollar
  837. }
  838. }
  839. }
  840. state = nil
  841. }
  842. if state == nil {
  843. lcUpdate := func(r rune) {
  844. if r == '\n' {
  845. line++
  846. column = 0
  847. } else {
  848. column++
  849. }
  850. }
  851. // All DFAs stuck. Return last match if it exists, otherwise advance by one rune and restart all DFAs.
  852. if matchn == -1 {
  853. if len(buf) == 0 { // This can only happen at the end of input.
  854. break
  855. }
  856. lcUpdate(buf[0])
  857. buf = buf[1:]
  858. } else {
  859. text := string(buf[:matchn])
  860. buf = buf[matchn:]
  861. matchn = -1
  862. for {
  863. sent := false
  864. select {
  865. case ch <- frame{matchi, text, line, column}: {
  866. sent = true
  867. }
  868. case stopped = <- ch_stop: {
  869. }
  870. default: {
  871. // nothing
  872. }
  873. }
  874. if stopped||sent {
  875. break
  876. }
  877. }
  878. if stopped {
  879. break
  880. }
  881. if len(family[matchi].nest) > 0 {
  882. scan(bufio.NewReader(strings.NewReader(text)), ch, ch_stop, family[matchi].nest, line, column)
  883. }
  884. if atEOF {
  885. break
  886. }
  887. for _, r := range text {
  888. lcUpdate(r)
  889. }
  890. }
  891. n = 0
  892. for i := 0; i < len(family); i++ {
  893. state = append(state, [2]int{i, 0})
  894. }
  895. }
  896. }
  897. ch <- frame{-1, "", line, column}
  898. }
  899. go scan(bufio.NewReader(in), yylex.ch, yylex.ch_stop, dfas, 0, 0)
  900. return yylex
  901. }
  902. type dfa struct {
  903. acc []bool // Accepting states.
  904. f []func(rune) int // Transitions.
  905. startf, endf []int // Transitions at start and end of input.
  906. nest []dfa
  907. }
  908. var dfas = []dfa{`
  909. var lexeroutro = `}
  910. func NewLexer(in io.Reader) *Lexer {
  911. return NewLexerWithInit(in, nil)
  912. }
  913. func (yyLex *Lexer) Stop() {
  914. yyLex.ch_stop <- true
  915. }
  916. // Text returns the matched text.
  917. func (yylex *Lexer) Text() string {
  918. return yylex.stack[len(yylex.stack) - 1].s
  919. }
  920. // Line returns the current line number.
  921. // The first line is 0.
  922. func (yylex *Lexer) Line() int {
  923. if len(yylex.stack) == 0 {
  924. return 0
  925. }
  926. return yylex.stack[len(yylex.stack) - 1].line
  927. }
  928. // Column returns the current column number.
  929. // The first column is 0.
  930. func (yylex *Lexer) Column() int {
  931. if len(yylex.stack) == 0 {
  932. return 0
  933. }
  934. return yylex.stack[len(yylex.stack) - 1].column
  935. }
  936. func (yylex *Lexer) next(lvl int) int {
  937. if lvl == len(yylex.stack) {
  938. l, c := 0, 0
  939. if lvl > 0 {
  940. l, c = yylex.stack[lvl - 1].line, yylex.stack[lvl - 1].column
  941. }
  942. yylex.stack = append(yylex.stack, frame{0, "", l, c})
  943. }
  944. if lvl == len(yylex.stack) - 1 {
  945. p := &yylex.stack[lvl]
  946. *p = <-yylex.ch
  947. yylex.stale = false
  948. } else {
  949. yylex.stale = true
  950. }
  951. return yylex.stack[lvl].i
  952. }
  953. func (yylex *Lexer) pop() {
  954. yylex.stack = yylex.stack[:len(yylex.stack) - 1]
  955. }
  956. `
  957. func writeLex(out *bufio.Writer, root rule) {
  958. if !customError {
  959. // TODO: I can't remember what this was for!
  960. prefixReplacer.WriteString(out, `func (yylex Lexer) Error(e string) {
  961. panic(e)
  962. }`)
  963. }
  964. prefixReplacer.WriteString(out, `
  965. // Lex runs the lexer. Always returns 0.
  966. // When the -s option is given, this function is not generated;
  967. // instead, the NN_FUN macro runs the lexer.
  968. func (yylex *Lexer) Lex(lval *yySymType) int {
  969. `)
  970. writeFamily(out, &root, 0)
  971. out.WriteString("\treturn 0\n}\n")
  972. }
  973. func writeNNFun(out *bufio.Writer, root rule) {
  974. prefixReplacer.WriteString(out, "func(yylex *Lexer) {\n")
  975. writeFamily(out, &root, 0)
  976. out.WriteString("}")
  977. }
  978. func process(output io.Writer, input io.Reader) error {
  979. lineno := 1
  980. in := bufio.NewReader(input)
  981. out := bufio.NewWriter(output)
  982. var r rune
  983. read := func() bool {
  984. var err error
  985. r, _, err = in.ReadRune()
  986. if err == io.EOF {
  987. return true
  988. }
  989. if err != nil {
  990. panic(err)
  991. }
  992. if r == '\n' {
  993. lineno++
  994. }
  995. return false
  996. }
  997. skipws := func() bool {
  998. for !read() {
  999. if strings.IndexRune(" \n\t\r", r) == -1 {
  1000. return false
  1001. }
  1002. }
  1003. return true
  1004. }
  1005. var buf []rune
  1006. readCode := func() string {
  1007. if '{' != r {
  1008. panic(ErrExpectedLBrace)
  1009. }
  1010. buf = []rune{r}
  1011. nesting := 1
  1012. for {
  1013. if read() {
  1014. panic(ErrUnmatchedLBrace)
  1015. }
  1016. buf = append(buf, r)
  1017. if '{' == r {
  1018. nesting++
  1019. } else if '}' == r {
  1020. nesting--
  1021. if 0 == nesting {
  1022. break
  1023. }
  1024. }
  1025. }
  1026. return string(buf)
  1027. }
  1028. var root rule
  1029. needRootRAngle := false
  1030. var parse func(*rule) error
  1031. parse = func(node *rule) error {
  1032. for {
  1033. panicIf(skipws, ErrUnexpectedEOF)
  1034. if '<' == r {
  1035. if node != &root || len(node.kid) > 0 {
  1036. panic(ErrUnexpectedLAngle)
  1037. }
  1038. panicIf(skipws, ErrUnexpectedEOF)
  1039. node.startCode = readCode()
  1040. needRootRAngle = true
  1041. continue
  1042. } else if '>' == r {
  1043. if node == &root {
  1044. if !needRootRAngle {
  1045. panic(ErrUnmatchedRAngle)
  1046. }
  1047. }
  1048. if skipws() {
  1049. return ErrUnexpectedEOF
  1050. }
  1051. node.endCode = readCode()
  1052. return nil
  1053. }
  1054. delim := r
  1055. panicIf(read, ErrUnexpectedEOF)
  1056. var regex []rune
  1057. for {
  1058. if r == delim && (len(regex) == 0 || regex[len(regex)-1] != '\\') {
  1059. break
  1060. }
  1061. if '\n' == r {
  1062. return ErrUnexpectedNewline
  1063. }
  1064. regex = append(regex, r)
  1065. panicIf(read, ErrUnexpectedEOF)
  1066. }
  1067. if "" == string(regex) {
  1068. break
  1069. }
  1070. panicIf(skipws, ErrUnexpectedEOF)
  1071. x := new(rule)
  1072. x.id = fmt.Sprintf("%d", lineno)
  1073. node.kid = append(node.kid, x)
  1074. x.regex = make([]rune, len(regex))
  1075. copy(x.regex, regex)
  1076. if '<' == r {
  1077. panicIf(skipws, ErrUnexpectedEOF)
  1078. x.startCode = readCode()
  1079. parse(x)
  1080. } else {
  1081. x.code = readCode()
  1082. }
  1083. }
  1084. return nil
  1085. }
  1086. err := parse(&root)
  1087. if err != nil {
  1088. return err
  1089. }
  1090. buf = nil
  1091. for done := skipws(); !done; done = read() {
  1092. buf = append(buf, r)
  1093. }
  1094. fs := token.NewFileSet()
  1095. // Append a blank line to make things easier when there are only package and
  1096. // import declarations.
  1097. t, err := parser.ParseFile(fs, "", string(buf)+"\n", parser.ImportsOnly)
  1098. if err != nil {
  1099. panic(err)
  1100. }
  1101. printer.Fprint(out, fs, t)
  1102. var file *token.File
  1103. fs.Iterate(func(f *token.File) bool {
  1104. file = f
  1105. return true
  1106. })
  1107. // Skip over package and import declarations. This is why we appended a blank
  1108. // line above.
  1109. for m := file.LineCount(); m > 1; m-- {
  1110. i := 0
  1111. for '\n' != buf[i] {
  1112. i++
  1113. }
  1114. buf = buf[i+1:]
  1115. }
  1116. prefixReplacer.WriteString(out, lexertext)
  1117. for _, kid := range root.kid {
  1118. gen(out, kid)
  1119. }
  1120. prefixReplacer.WriteString(out, lexeroutro)
  1121. if !standalone {
  1122. writeLex(out, root)
  1123. out.WriteString(string(buf))
  1124. out.Flush()
  1125. if len(outFilename) > 0 {
  1126. gofmt()
  1127. }
  1128. return nil
  1129. }
  1130. m := 0
  1131. const funmac = "NN_FUN"
  1132. for m < len(buf) {
  1133. m++
  1134. if funmac[:m] != string(buf[:m]) {
  1135. out.WriteString(string(buf[:m]))
  1136. buf = buf[m:]
  1137. m = 0
  1138. } else if funmac == string(buf[:m]) {
  1139. writeNNFun(out, root)
  1140. buf = buf[m:]
  1141. m = 0
  1142. }
  1143. }
  1144. out.WriteString(string(buf))
  1145. out.Flush()
  1146. if len(outFilename) > 0 {
  1147. gofmt()
  1148. }
  1149. return nil
  1150. }
  1151. func gofmt() {
  1152. src, err := ioutil.ReadFile(outFilename)
  1153. if err != nil {
  1154. return
  1155. }
  1156. src, err = format.Source(src)
  1157. if err != nil {
  1158. return
  1159. }
  1160. ioutil.WriteFile(outFilename, src, 0666)
  1161. }
  1162. func panicIf(f func() bool, err error) {
  1163. if f() {
  1164. panic(err)
  1165. }
  1166. }
  1167. func dieIf(cond bool, v ...interface{}) {
  1168. if cond {
  1169. log.Fatal(v...)
  1170. }
  1171. }
  1172. func dieErr(err error, s string) {
  1173. if err != nil {
  1174. log.Fatalf("%v: %v", s, err)
  1175. }
  1176. }
  1177. func createDotFile(filename string) *os.File {
  1178. if filename == "" {
  1179. return nil
  1180. }
  1181. suf := strings.HasSuffix(filename, ".nex")
  1182. dieIf(suf, "nex: DOT filename ends with .nex:", filename)
  1183. file, err := os.Create(filename)
  1184. dieErr(err, "Create")
  1185. return file
  1186. }