Add some scripts for getting character usage info

This commit is contained in:
Tellow Krinkle
2018-09-07 04:18:05 -05:00
parent 7ca3d475e0
commit a28e0cce65
7 changed files with 642 additions and 0 deletions

View File

@@ -0,0 +1,315 @@
//
// Scanner.swift
// CParser_CS440
//
import Foundation
enum TokenType: String {
case stringLiteral, characterLiteral, punctuation, identifier
}
enum TokenizationError: Error {
case badPunctuation(row: Int, column: Int, character: UnicodeScalar)
case unclosedString(row: Int, column: Int, string: String)
}
struct TokenListSignature: Hashable, Equatable {
let list: [TokenType]
init(_ array: [TokenType]) {
self.list = array
}
init(from tokens: [Token]) {
list = tokens.map { $0.type }
}
static func ==(lhs: TokenListSignature, rhs: TokenListSignature) -> Bool {
guard lhs.list.count == rhs.list.count else { return false }
for tokentype in lhs.list.enumerated() {
if rhs.list[tokentype.offset] != tokentype.element {
return false
}
}
return true
}
var hashValue: Int {
return list.map({ $0.hashValue }).reduce(5381) {
($0 << 5) &+ $0 &+ $1
}
}
}
struct MovingStringRange {
let string: String.UnicodeScalarView
private(set) var back: String.UnicodeScalarIndex
private(set) var front: String.UnicodeScalarIndex
private(set) var length: Int
private(set) var backRow: Int
private(set) var backColumn: Int
private(set) var frontRow: Int
private(set) var frontColumn: Int
init(_ string: String.UnicodeScalarView, atEnd: Bool = false) {
self.string = string
self.length = 0
self.back = atEnd ? string.endIndex : string.startIndex
self.front = back
self.backRow = 1
self.backColumn = 1
self.frontRow = 1
self.frontColumn = 1
}
init(_ string: String, atEnd: Bool = false) {
self.init(string.unicodeScalars, atEnd: atEnd)
}
private mutating func advanceFront() {
if frontChar == "\n" {
frontColumn = 1
frontRow += 1
}
else {
frontColumn += 1
}
front = string.index(after: front)
length += 1
}
private mutating func retreatFront() {
front = string.index(before: front)
length -= 1
if frontChar == "\n" {
var check = string.index(before: front)
frontColumn = 2
while check >= string.startIndex && string[check] != "\n" {
frontColumn += 1
check = string.index(before: check)
}
frontRow -= 1
}
else {
frontColumn -= 1
}
}
mutating func advanceFront(by: Int = 1) {
if by > 0 {
for _ in 0..<by {
advanceFront()
}
}
else {
for _ in 0..<(-by) {
retreatFront()
}
}
}
private mutating func advanceBack() {
if backChar == "\n" {
backColumn = 1
backRow += 1
}
else {
backColumn += 1
}
back = string.index(after: back)
length += 1
}
private mutating func retreatBack() {
back = string.index(before: back)
length -= 1
if backChar == "\n" {
var check = string.index(before: back)
backColumn = 2
while check >= string.startIndex && string[check] != "\n" {
backColumn += 1
check = string.index(before: check)
}
backRow -= 1
}
else {
backColumn -= 1
}
}
mutating func advanceBack(by: Int = 1) {
if by > 0 {
for _ in 0..<by {
advanceBack()
}
}
else {
for _ in 0..<(-by) {
retreatBack()
}
}
}
mutating func setBackToFront() {
back = front
backColumn = frontColumn
backRow = frontRow
length = 0
}
mutating func setFrontToBack() {
front = back
frontColumn = backColumn
frontRow = backRow
length = 0
}
var currentRange: String {
return String(string[back..<front])
}
var frontChar: UnicodeScalar {
return string[front]
}
var backChar: UnicodeScalar {
return string[back]
}
var backIsBeginning: Bool {
return back <= string.startIndex
}
var backIsEnd: Bool {
return back >= string.endIndex
}
var frontIsBeginning: Bool {
return front <= string.startIndex
}
var frontIsEnd: Bool {
return front >= string.endIndex
}
}
extension UnicodeScalar {
var isNewline: Bool {
return (0x0a...0x0d).contains(self.value) || self.value == 0x85 || self.value == 0x2028 || self.value == 0x2029
}
var isWhitespace: Bool {
return self.value == 0x20 || self.value == 0xa0 || self.value == 0x1680 || (0x2000...0x200a).contains(self.value) || self.value == 0x202f || self.value == 0x205f || self.value == 0x3000
}
var isNewlineOrWhitespace: Bool {
return isNewline || isWhitespace
}
}
struct Token: CustomStringConvertible {
let type: TokenType
let value: String
let row: Int
let column: Int
init(type: TokenType, value: String, row: Int, column: Int) {
self.type = type
self.value = value
self.row = row
self.column = column
}
init(type: TokenType, value: String.UnicodeScalarView, row: Int, column: Int) {
self.init(type: type, value: String(value), row: row, column: column)
}
var description: String {
// return "[\(type) \(value)]"
switch type {
case .identifier, .punctuation:
return value
case .stringLiteral:
return "\"\(value)\""
case .characterLiteral:
return "'\(value)'"
}
}
static func tokenize(input: String) throws -> [Token] {
var inputRange = MovingStringRange(input)
var tokens: [Token] = []
while !inputRange.backIsEnd {
//print("Currently looking from row \(inputRange.backRow) column \(inputRange.backColumn) to row \(inputRange.frontRow) column \(inputRange.frontColumn), \(inputRange.currentRange)")
if inputRange.frontIsEnd { // If this is the end of the file
if (inputRange.length > 0) {
tokens.append(Token(type: .identifier, value: inputRange.currentRange, row: inputRange.backRow, column: inputRange.backColumn))
}
inputRange.setBackToFront()
continue
}
else if inputRange.frontChar.isNewlineOrWhitespace { // Whitespace, end of token
if inputRange.length > 0 { // If there's multiple whitespace chars in a row, don't add empty tokens
tokens.append(Token(type: .identifier, value: inputRange.currentRange, row: inputRange.backRow, column: inputRange.backColumn))
}
inputRange.advanceFront()
inputRange.setBackToFront()
}
else if punctuationCharacters.contains(inputRange.frontChar) {
if inputRange.length > 0 { // Add the previous identifier if it exists
tokens.append(Token(type: .identifier, value: inputRange.currentRange, row: inputRange.backRow, column: inputRange.backColumn))
}
inputRange.setBackToFront()
while !inputRange.frontIsEnd && (punctuationCharacters.contains(inputRange.frontChar) || inputRange.length > 0) {
// Keep going until we reach the end of the file and have parsed it all
inputRange.advanceFront()
if inputRange.length > longestPunctuation || inputRange.frontIsEnd || !punctuationCharacters.contains(inputRange.frontChar) {
var punctuationToken = inputRange.currentRange
while inputRange.length > 1 && !allPunctuation.contains(punctuationToken) {
inputRange.advanceFront(by: -1)
punctuationToken = inputRange.currentRange
}
if commentPunctuation.contains(punctuationToken) {
if punctuationToken == "//" {
while !inputRange.frontIsEnd && !inputRange.frontChar.isNewline {
inputRange.advanceFront()
}
}
else {
while !inputRange.frontIsEnd && inputRange.frontChar != "/" {
while !inputRange.frontIsEnd && inputRange.frontChar != "*" {
inputRange.advanceFront()
}
inputRange.advanceFront()
}
}
inputRange.advanceFront()
inputRange.setBackToFront()
continue
}
if allPunctuation.contains(punctuationToken) {
tokens.append(Token(type: .punctuation, value: punctuationToken, row: inputRange.backRow, column: inputRange.backColumn))
inputRange.setBackToFront()
}
else {
throw TokenizationError.badPunctuation(row: inputRange.backRow, column: inputRange.backColumn, character: inputRange.backChar)
}
}
}
}
else if inputRange.frontChar == "\"" || inputRange.frontChar == "'" {
let quoteType = inputRange.frontChar
if inputRange.length > 0 { // Add the previous identifier if it exists
tokens.append(Token(type: .identifier, value: inputRange.currentRange, row: inputRange.backRow, column: inputRange.backColumn))
}
inputRange.advanceFront()
inputRange.setBackToFront()
while true {
if inputRange.frontIsEnd || inputRange.frontChar.isNewline {
inputRange.advanceBack(by: -1)
throw TokenizationError.unclosedString(row: inputRange.backRow, column: inputRange.backColumn, string: inputRange.currentRange)
}
else if inputRange.frontChar == "\\" {
inputRange.advanceFront(by: 2)
}
else if inputRange.frontChar == quoteType {
let type: TokenType
if quoteType == "'" {
type = .characterLiteral
}
else {
type = .stringLiteral
}
tokens.append(Token(type: type, value: inputRange.currentRange, row: inputRange.backRow, column: inputRange.backColumn - 1))
inputRange.advanceFront()
inputRange.setBackToFront()
break
}
else {
inputRange.advanceFront()
}
}
}
else {
inputRange.advanceFront()
}
}
return tokens
}
}