hotime/vendor/go.mongodb.org/mongo-driver/bson/bsonrw/json_scanner.go

// Copyright (C) MongoDB, Inc. 2017-present.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License. You may obtain
// a copy of the License at http://www.apache.org/licenses/LICENSE-2.0

package bsonrw

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"math"
	"strconv"
	"unicode"
	"unicode/utf16"
)

type jsonTokenType byte

const (
	jttBeginObject jsonTokenType = iota
	jttEndObject
	jttBeginArray
	jttEndArray
	jttColon
	jttComma
	jttInt32
	jttInt64
	jttDouble
	jttString
	jttBool
	jttNull
	jttEOF
)

type jsonToken struct {
	t jsonTokenType
	v interface{}
	p int
}

type jsonScanner struct {
	r           io.Reader
	buf         []byte
	pos         int
	lastReadErr error
}

// nextToken returns the next JSON token if one exists. A token is a character
// of the JSON grammar, a number, a string, or a literal.
func (js *jsonScanner) nextToken() (*jsonToken, error) {
	c, err := js.readNextByte()

	// keep reading until a non-space is encountered (break on read error or EOF)
	for isWhiteSpace(c) && err == nil {
		c, err = js.readNextByte()
	}

	if err == io.EOF {
		return &jsonToken{t: jttEOF}, nil
	} else if err != nil {
		return nil, err
	}

	// switch on the character
	switch c {
	case '{':
		return &jsonToken{t: jttBeginObject, v: byte('{'), p: js.pos - 1}, nil
	case '}':
		return &jsonToken{t: jttEndObject, v: byte('}'), p: js.pos - 1}, nil
	case '[':
		return &jsonToken{t: jttBeginArray, v: byte('['), p: js.pos - 1}, nil
	case ']':
		return &jsonToken{t: jttEndArray, v: byte(']'), p: js.pos - 1}, nil
	case ':':
		return &jsonToken{t: jttColon, v: byte(':'), p: js.pos - 1}, nil
	case ',':
		return &jsonToken{t: jttComma, v: byte(','), p: js.pos - 1}, nil
	case '"': // RFC-8259 only allows for double quotes (") not single (')
		return js.scanString()
	default:
		// check if it's a number
		if c == '-' || isDigit(c) {
			return js.scanNumber(c)
		} else if c == 't' || c == 'f' || c == 'n' {
			// maybe a literal
			return js.scanLiteral(c)
		} else {
			return nil, fmt.Errorf("invalid JSON input. Position: %d. Character: %c", js.pos-1, c)
		}
	}
}

// readNextByte attempts to read the next byte from the buffer. If the buffer
// has been exhausted, this function calls readIntoBuf, thus refilling the
// buffer and resetting the read position to 0
func (js *jsonScanner) readNextByte() (byte, error) {
	if js.pos >= len(js.buf) {
		err := js.readIntoBuf()

		if err != nil {
			return 0, err
		}
	}

	b := js.buf[js.pos]
	js.pos++

	return b, nil
}

// readNNextBytes reads n bytes into dst, starting at offset
func (js *jsonScanner) readNNextBytes(dst []byte, n, offset int) error {
	var err error

	for i := 0; i < n; i++ {
		dst[i+offset], err = js.readNextByte()
		if err != nil {
			return err
		}
	}

	return nil
}

// readIntoBuf reads up to 512 bytes from the scanner's io.Reader into the buffer
func (js *jsonScanner) readIntoBuf() error {
	if js.lastReadErr != nil {
		js.buf = js.buf[:0]
		js.pos = 0
		return js.lastReadErr
	}

	if cap(js.buf) == 0 {
		js.buf = make([]byte, 0, 512)
	}

	n, err := js.r.Read(js.buf[:cap(js.buf)])
	if err != nil {
		js.lastReadErr = err
		if n > 0 {
			err = nil
		}
	}
	js.buf = js.buf[:n]
	js.pos = 0

	return err
}

func isWhiteSpace(c byte) bool {
	return c == ' ' || c == '\t' || c == '\r' || c == '\n'
}

func isDigit(c byte) bool {
	return unicode.IsDigit(rune(c))
}

func isValueTerminator(c byte) bool {
	return c == ',' || c == '}' || c == ']' || isWhiteSpace(c)
}

// getu4 decodes the 4-byte hex sequence from the beginning of s, returning the hex value as a rune,
// or it returns -1. Note that the "\u" from the unicode escape sequence should not be present.
// It is copied and lightly modified from the Go JSON decode function at
// https://github.com/golang/go/blob/1b0a0316802b8048d69da49dc23c5a5ab08e8ae8/src/encoding/json/decode.go#L1169-L1188
func getu4(s []byte) rune {
	if len(s) < 4 {
		return -1
	}
	var r rune
	for _, c := range s[:4] {
		switch {
		case '0' <= c && c <= '9':
			c = c - '0'
		case 'a' <= c && c <= 'f':
			c = c - 'a' + 10
		case 'A' <= c && c <= 'F':
			c = c - 'A' + 10
		default:
			return -1
		}
		r = r*16 + rune(c)
	}
	return r
}

// scanString reads from an opening '"' to a closing '"' and handles escaped characters
func (js *jsonScanner) scanString() (*jsonToken, error) {
	var b bytes.Buffer
	var c byte
	var err error

	p := js.pos - 1

	for {
		c, err = js.readNextByte()
		if err != nil {
			if err == io.EOF {
				return nil, errors.New("end of input in JSON string")
			}
			return nil, err
		}

	evalNextChar:
		switch c {
		case '\\':
			c, err = js.readNextByte()
			if err != nil {
				if err == io.EOF {
					return nil, errors.New("end of input in JSON string")
				}
				return nil, err
			}

		evalNextEscapeChar:
			switch c {
			case '"', '\\', '/':
				b.WriteByte(c)
			case 'b':
				b.WriteByte('\b')
			case 'f':
				b.WriteByte('\f')
			case 'n':
				b.WriteByte('\n')
			case 'r':
				b.WriteByte('\r')
			case 't':
				b.WriteByte('\t')
			case 'u':
				us := make([]byte, 4)
				err = js.readNNextBytes(us, 4, 0)
				if err != nil {
					return nil, fmt.Errorf("invalid unicode sequence in JSON string: %s", us)
				}

				rn := getu4(us)

				// If the rune we just decoded is the high or low value of a possible surrogate pair,
				// try to decode the next sequence as the low value of a surrogate pair. We're
				// expecting the next sequence to be another Unicode escape sequence (e.g. "\uDD1E"),
				// but need to handle cases where the input is not a valid surrogate pair.
				// For more context on unicode surrogate pairs, see:
				// https://www.christianfscott.com/rust-chars-vs-go-runes/
				// https://www.unicode.org/glossary/#high_surrogate_code_point
				if utf16.IsSurrogate(rn) {
					c, err = js.readNextByte()
					if err != nil {
						if err == io.EOF {
							return nil, errors.New("end of input in JSON string")
						}
						return nil, err
					}

					// If the next value isn't the beginning of a backslash escape sequence, write
					// the Unicode replacement character for the surrogate value and goto the
					// beginning of the next char eval block.
					if c != '\\' {
						b.WriteRune(unicode.ReplacementChar)
						goto evalNextChar
					}

					c, err = js.readNextByte()
					if err != nil {
						if err == io.EOF {
							return nil, errors.New("end of input in JSON string")
						}
						return nil, err
					}

					// If the next value isn't the beginning of a unicode escape sequence, write the
					// Unicode replacement character for the surrogate value and goto the beginning
					// of the next escape char eval block.
					if c != 'u' {
						b.WriteRune(unicode.ReplacementChar)
						goto evalNextEscapeChar
					}

					err = js.readNNextBytes(us, 4, 0)
					if err != nil {
						return nil, fmt.Errorf("invalid unicode sequence in JSON string: %s", us)
					}

					rn2 := getu4(us)

					// Try to decode the pair of runes as a utf16 surrogate pair. If that fails, write
					// the Unicode replacement character for the surrogate value and the 2nd decoded rune.
					if rnPair := utf16.DecodeRune(rn, rn2); rnPair != unicode.ReplacementChar {
						b.WriteRune(rnPair)
					} else {
						b.WriteRune(unicode.ReplacementChar)
						b.WriteRune(rn2)
					}

					break
				}

				b.WriteRune(rn)
			default:
				return nil, fmt.Errorf("invalid escape sequence in JSON string '\\%c'", c)
			}
		case '"':
			return &jsonToken{t: jttString, v: b.String(), p: p}, nil
		default:
			b.WriteByte(c)
		}
	}
}

// scanLiteral reads an unquoted sequence of characters and determines if it is one of
// three valid JSON literals (true, false, null); if so, it returns the appropriate
// jsonToken; otherwise, it returns an error
func (js *jsonScanner) scanLiteral(first byte) (*jsonToken, error) {
	p := js.pos - 1

	lit := make([]byte, 4)
	lit[0] = first

	err := js.readNNextBytes(lit, 3, 1)
	if err != nil {
		return nil, err
	}

	c5, err := js.readNextByte()

	if bytes.Equal([]byte("true"), lit) && (isValueTerminator(c5) || err == io.EOF) {
		js.pos = int(math.Max(0, float64(js.pos-1)))
		return &jsonToken{t: jttBool, v: true, p: p}, nil
	} else if bytes.Equal([]byte("null"), lit) && (isValueTerminator(c5) || err == io.EOF) {
		js.pos = int(math.Max(0, float64(js.pos-1)))
		return &jsonToken{t: jttNull, v: nil, p: p}, nil
	} else if bytes.Equal([]byte("fals"), lit) {
		if c5 == 'e' {
			c5, err = js.readNextByte()

			if isValueTerminator(c5) || err == io.EOF {
				js.pos = int(math.Max(0, float64(js.pos-1)))
				return &jsonToken{t: jttBool, v: false, p: p}, nil
			}
		}
	}

	return nil, fmt.Errorf("invalid JSON literal. Position: %d, literal: %s", p, lit)
}

type numberScanState byte

const (
	nssSawLeadingMinus numberScanState = iota
	nssSawLeadingZero
	nssSawIntegerDigits
	nssSawDecimalPoint
	nssSawFractionDigits
	nssSawExponentLetter
	nssSawExponentSign
	nssSawExponentDigits
	nssDone
	nssInvalid
)

// scanNumber reads a JSON number (according to RFC-8259)
func (js *jsonScanner) scanNumber(first byte) (*jsonToken, error) {
	var b bytes.Buffer
	var s numberScanState
	var c byte
	var err error

	t := jttInt64 // assume it's an int64 until the type can be determined
	start := js.pos - 1

	b.WriteByte(first)

	switch first {
	case '-':
		s = nssSawLeadingMinus
	case '0':
		s = nssSawLeadingZero
	default:
		s = nssSawIntegerDigits
	}

	for {
		c, err = js.readNextByte()

		if err != nil && err != io.EOF {
			return nil, err
		}

		switch s {
		case nssSawLeadingMinus:
			switch c {
			case '0':
				s = nssSawLeadingZero
				b.WriteByte(c)
			default:
				if isDigit(c) {
					s = nssSawIntegerDigits
					b.WriteByte(c)
				} else {
					s = nssInvalid
				}
			}
		case nssSawLeadingZero:
			switch c {
			case '.':
				s = nssSawDecimalPoint
				b.WriteByte(c)
			case 'e', 'E':
				s = nssSawExponentLetter
				b.WriteByte(c)
			case '}', ']', ',':
				s = nssDone
			default:
				if isWhiteSpace(c) || err == io.EOF {
					s = nssDone
				} else {
					s = nssInvalid
				}
			}
		case nssSawIntegerDigits:
			switch c {
			case '.':
				s = nssSawDecimalPoint
				b.WriteByte(c)
			case 'e', 'E':
				s = nssSawExponentLetter
				b.WriteByte(c)
			case '}', ']', ',':
				s = nssDone
			default:
				if isWhiteSpace(c) || err == io.EOF {
					s = nssDone
				} else if isDigit(c) {
					s = nssSawIntegerDigits
					b.WriteByte(c)
				} else {
					s = nssInvalid
				}
			}
		case nssSawDecimalPoint:
			t = jttDouble
			if isDigit(c) {
				s = nssSawFractionDigits
				b.WriteByte(c)
			} else {
				s = nssInvalid
			}
		case nssSawFractionDigits:
			switch c {
			case 'e', 'E':
				s = nssSawExponentLetter
				b.WriteByte(c)
			case '}', ']', ',':
				s = nssDone
			default:
				if isWhiteSpace(c) || err == io.EOF {
					s = nssDone
				} else if isDigit(c) {
					s = nssSawFractionDigits
					b.WriteByte(c)
				} else {
					s = nssInvalid
				}
			}
		case nssSawExponentLetter:
			t = jttDouble
			switch c {
			case '+', '-':
				s = nssSawExponentSign
				b.WriteByte(c)
			default:
				if isDigit(c) {
					s = nssSawExponentDigits
					b.WriteByte(c)
				} else {
					s = nssInvalid
				}
			}
		case nssSawExponentSign:
			if isDigit(c) {
				s = nssSawExponentDigits
				b.WriteByte(c)
			} else {
				s = nssInvalid
			}
		case nssSawExponentDigits:
			switch c {
			case '}', ']', ',':
				s = nssDone
			default:
				if isWhiteSpace(c) || err == io.EOF {
					s = nssDone
				} else if isDigit(c) {
					s = nssSawExponentDigits
					b.WriteByte(c)
				} else {
					s = nssInvalid
				}
			}
		}

		switch s {
		case nssInvalid:
			return nil, fmt.Errorf("invalid JSON number. Position: %d", start)
		case nssDone:
			js.pos = int(math.Max(0, float64(js.pos-1)))
			if t != jttDouble {
				v, err := strconv.ParseInt(b.String(), 10, 64)
				if err == nil {
					if v < math.MinInt32 || v > math.MaxInt32 {
						return &jsonToken{t: jttInt64, v: v, p: start}, nil
					}

					return &jsonToken{t: jttInt32, v: int32(v), p: start}, nil
				}
			}

			v, err := strconv.ParseFloat(b.String(), 64)
			if err != nil {
				return nil, err
			}

			return &jsonToken{t: jttDouble, v: v, p: start}, nil
		}
	}
}