[bb] Blitz Lexical Analyser by Steve Hill [ 1+ years ago ]

Started by BlitzBot, June 29, 2017, 00:28:40

Previous topic - Next topic

BlitzBot

Title : Blitz Lexical Analyser
Author : Steve Hill
Posted : 1+ years ago

Description : This code splits a Blitz source file into tokens.

Each token represents items like operators, keywords, comments, end of line, functions etc.

Useful if you want to write a pre-processor or other tool that manipulates Blitz code files.

Contains an example for just printing the tokens in a file.


Code :
Code (blitzbasic) Select
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Lexer.bb
;
; Tokenises Blitz Basic code
;
; Steve Hill, 2003
;
; OpenState(fileName$) - creates a new TState
; CloseState()         - destroys and closes the current state
; GetToken(state)      - read the next token
;
; The current token is available in state\tok$
;
; Versions
; 0.1 Initial version 27 Aug 2003
; 0.2 Added >< => =< 29 Aug 2003
;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; CONSTANTS
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Constants for various character types
;
; ASCII codes of the space and control characters the lexer tests for.
Const  SPACE        = 32
Const  TAB = 9
Const  CR = 13
Const  LF           = 10

; Characters skipped between tokens. LF is deliberately absent: GetToken
; reports a line feed as its own end-of-line token.
Global WHITE_SPACE$ = Chr$(SPACE) + Chr$(TAB) + Chr$(CR)
; Characters that may start a word.
Const  ALPHA$       = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
; Character sets accepted inside decimal, hexadecimal and binary literals.
Const  DIGITS$      = "0123456789"
Const  HEXDIGITS$   = "0123456789abcdefABCDEF"
Const  BINDIGITS$   = "01"
; Characters GetToken passes to GetOperator. "%" and "$" are listed here
; but GetToken tests for them first and routes them to GetBinNumber and
; GetHexNumber instead.
Const  DELIM$ = "^*+-~<>/#%.$()[],=:"
; Double-quote character, the string literal delimiter.
Global QUOTE$ = Chr$(34)
; Characters that may continue a word (GetWord also accepts "_").
Global ALPHANUM$ = ALPHA$ + DIGITS$

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; TYPES
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; TState
;
; Keeps track of the current file, look-ahead character and token.
; Can be used as a stack for include files.
;
Type TState
; File handle returned by ReadFile in OpenState.
Field file
; Look-ahead character: the next unread character, "" at end of file.
Field ch$
; Text of the current token.
Field tok$
; Kind of the current token: one of the TOK_* constants.
Field tokType
; Current line number; starts at 1 and is advanced by GetChar.
Field lineNum
; Field charNum
End Type

; Token kinds stored in the tokType field of TState.
; Identifier or keyword (set by GetWord).
Const TOK_WORD = 0
; Operator or delimiter, including the words And / Or / Xor.
Const TOK_OPERATOR = 1
; Comment starting with ";" and running to the end of the line.
Const TOK_COMMENT = 2
; Decimal literal, e.g. 1 or 1.2
Const TOK_DEC_NUMBER = 3
; Hexadecimal literal, e.g. $abC1
Const TOK_HEX_NUMBER = 4
; Binary literal, e.g. %1100
Const TOK_BIN_NUMBER = 5
; Double-quoted string literal.
Const TOK_STRING = 6
; Line feed.
Const TOK_EOL = 7
; End of input; the token text is "".
Const TOK_EOF = 8
; Declared but never assigned by the code in this listing.
Const TOK_UNKNOWN = 9

; TDescriptor
;
; Describes a function: name, return type and parameter type list.
; Assigned unique id for each functions ... its "pointer"
;
; NOTE(review): no code in this listing creates or reads a TDescriptor;
; it is presumably used by a tool built on top of the lexer.
Type TDescriptor
; Function name.
Field name$
; Return type.
Field typ$
; Parameter type list.
Field params$
; Unique id assigned to the function.
Field id
End Type

; Error
;
; Something has gone wrong bail out.
;
Function Error(e$, state.TState)
    ; Prints the message e$, adds the current line number when state has
    ; an open file, waits for a key press and ends the program.
    ; This function never returns to its caller.
    Print e$
    ; The tests are nested because And evaluates both operands in Blitz,
    ; so a combined condition would still dereference a Null state.
    If state <> Null
        If state\file <> 0
            Print "Error on line " + Str$(state\lineNum)
        EndIf
    EndIf
    Print "Press a key"
    WaitKey
    End
End Function

; OpenState
;
; Open the file initialise the fields
;
Function OpenState(name$)
    ; Creates a new TState for the file name$, initialises its fields and
    ; reads the first token into it. Nothing is returned; the usage example
    ; at the end of this file fetches the state with "Last TState".
    ; If the file cannot be opened, Error() reports it and ends the program.
    Print "Parsing " + name$
    state.TState = New TState
    state\file = ReadFile(name$)
    If state\file = 0 Then
        Error("File " + name$ + " not found", state)
    EndIf
    state\lineNum = 1
    state\ch$ = ""
    state\tok$ = ""

    ; Prime the state so the first token is available immediately.
    GetToken(state)
End Function

; CloseState
;
; Close current file, pop state
;
Function CloseState()
    ; Closes the file of every existing TState and then deletes them all,
    ; not just the most recently opened one.
    For state.TState = Each TState
        ; Skip states whose file was never opened successfully.
        If state\file <> 0 Then CloseFile(state\file)
    Next

    Delete Each TState
End Function

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; LEXICAL FUNCTIONS
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; GetChar
;
; Read a character from the current file into the state
;
Function GetChar(state.TState)
    ; Advances the look-ahead character state\ch$ by one byte of the file.
    ; At end of file state\ch$ becomes "".

    ; The line counter is bumped when the character being replaced is a
    ; line feed, i.e. once the lexer moves past the end of a line.
    If state\ch$ = Chr$(LF)
        state\lineNum = state\lineNum + 1
    EndIf

    If Eof(state\file) Then
        state\ch$ = ""
        Return
    EndIf

    state\ch$ = Chr$(ReadByte(state\file))
End Function

; SkipSpace
;
; Skip white space
;
Function SkipSpace(state.TState)
    ; Advances the look-ahead past spaces, tabs and carriage returns.
    ; Line feeds are not skipped; they become end-of-line tokens.
    ;
    ; NOTE(review): OpenState starts with state\ch$ = "", so the first read
    ; appears to depend on Instr reporting a match for an empty search
    ; string - confirm before changing the loop condition.
    While Instr(WHITE_SPACE$, state\ch$) <> 0
        GetChar(state)
        ; "" marks end of input; stop instead of looping forever.
        If state\ch$ = "" Return
    Wend
End Function

; GetFollowing
;
; Generic token reader, reads while characters are
; those in pat.  Places token in state.
;
Function GetFollowing(state.TState, pat$)
    ; Generic token reader. The current look-ahead character is always
    ; taken as the first character of the token; further characters are
    ; appended for as long as they occur in pat$. The result is stored in
    ; state\tok$ and the look-ahead is left on the first character that
    ; does not belong to the token.
    text$ = state\ch$
    GetChar(state)
    While Instr(pat$, state\ch$) <> 0
        ; End of input ends the token.
        If state\ch$ = "" Exit
        text$ = text$ + state\ch$
        GetChar(state)
    Wend
    state\tok$ = text$
End Function

; GetDecNumber
;
; eg. 1 or 1.2
;
Function GetDecNumber$(state.TState)
    ; Reads a run of digits and "." into state\tok$ and marks it as a
    ; decimal number. The number of "." characters is not checked, so
    ; "1.2.3" is read as a single token.
    GetFollowing(state, DIGITS$ + ".")
    state\tokType = TOK_DEC_NUMBER
End Function

; GetBinNumber
;
; eg. %1100
;
Function GetBinNumber$(state.TState)
    ; Reads "%" followed by binary digits into state\tok$.
    ; A "%" with no digits after it (e.g. the integer type tag in "a%")
    ; is reported as an operator, mirroring what GetHexNumber does for a
    ; lone "$".
    GetFollowing(state, BINDIGITS$)
    If state\tok$ = "%"
        state\tokType = TOK_OPERATOR
    Else
        state\tokType = TOK_BIN_NUMBER
    EndIf
End Function

; GetHexNumber
;
; eg. $abC1
;
Function GetHexNumber$(state.TState)
    ; Reads "$" followed by hexadecimal digits into state\tok$.
    ; A "$" with no digits after it is reported as an operator rather
    ; than as a hexadecimal number.
    GetFollowing(state, HEXDIGITS$)
    If state\tok$ = "$"
        state\tokType = TOK_OPERATOR
    Else
        state\tokType = TOK_HEX_NUMBER
    EndIf
End Function

; GetEOL
;
; Get end of line
Function GetEOL(state.TState)
    ; Stores the current look-ahead character (a line feed when called
    ; from GetToken) as an end-of-line token and advances past it.
    state\tok$ = state\ch$
    GetChar(state)
    state\tokType = TOK_EOL
End Function

; GetWord
;
; eg. myVar_2 or WaitKey
;
Function GetWord(state.TState)
    ; Reads an identifier or keyword (letters, digits and "_") into
    ; state\tok$. The words And, Or and Xor are reported as operators;
    ; every other word is TOK_WORD.
    GetFollowing(state, ALPHANUM$ + "_")
    ; Compare in lower case: Blitz keywords are case-insensitive, so
    ; "or", "OR" and "Or" must all be recognised.
    word$ = Lower$(state\tok$)
    If word$ = "or" Or word$ = "and" Or word$ = "xor" Then
        state\tokType = TOK_OPERATOR
    Else
        state\tokType = TOK_WORD
    EndIf
End Function

; GetOperator
;
; eg. , .  + - = > < <> etc.
;
Function GetOperator(state.TState)
    ; Reads an operator into state\tok$ and marks it TOK_OPERATOR.
    ; Every operator is a single character except the two-character
    ; comparisons >= >< <= <> => =<, which are combined into one token.
    first$ = state\ch$
    GetChar(state)
    ; Character after the operator; "" at end of input, which matches
    ; none of the tests below.
    nxt$ = state\ch$

    op$ = first$
    Select first$
        Case ">"
            If nxt$ = "=" Or nxt$ = "<" Then op$ = first$ + nxt$
        Case "<"
            If nxt$ = "=" Or nxt$ = ">" Then op$ = first$ + nxt$
        Case "="
            If nxt$ = ">" Or nxt$ = "<" Then op$ = first$ + nxt$
    End Select

    ; Consume the second character only when it became part of the token.
    If Len(op$) = 2 Then GetChar(state)

    state\tok$ = op$
    state\tokType = TOK_OPERATOR
End Function

; GetComment
;
; eg. ; a comment
;
Function GetComment(state.TState)
    ; Reads a comment, starting at the current look-ahead character (the
    ; ";") and running up to, but not including, the next line feed or the
    ; end of input. Carriage returns are dropped from the token text.
    text$ = state\ch$
    GetChar(state)
    While state\ch$ <> Chr$(LF)
        If state\ch$ = "" Exit
        If state\ch$ <> Chr$(CR)
            text$ = text$ + state\ch$
        EndIf
        GetChar(state)
    Wend
    state\tok$ = text$
    state\tokType = TOK_COMMENT
End Function

; GetString
;
; eg. "a string"
;
Function GetString(state.TState)
    ; Reads a string literal. The look-ahead is on the opening quote when
    ; this is called. The token text stored in state\tok$ includes the
    ; surrounding quotes.
    body$ = ""
    GetChar(state)
    While state\ch$ <> QUOTE$ And state\ch$ <> ""
        body$ = body$ + state\ch$
        GetChar(state)
    Wend

    If state\ch$ <> ""
        state\tok$ = QUOTE$ + body$ + QUOTE$
        ; Step past the closing quote.
        GetChar(state)
    Else
        ; Input ended before the closing quote. Store what was read so
        ; the token does not keep the previous token's text.
        state\tok$ = QUOTE$ + body$
    EndIf

    state\tokType = TOK_STRING
End Function

; GetToken
;
; Use first character to determine type of token and then
; read appropriate token using the corresponding Get function
;
Function GetToken(state.TState)
    ; Reads the next token into state\tok$ / state\tokType.
    ; White space is skipped first; the first remaining character selects
    ; the reader function. At end of input the token is "" with type
    ; TOK_EOF. An unrecognised character is reported through Error(),
    ; which ends the program.
    SkipSpace(state)

    ch$ = state\ch$

    If ch$ = "" Then
        state\tok$ = ""
        state\tokType = TOK_EOF
        Return
    EndIf

    ; The order matters: "%" and "$" are also in DELIM$ and must be
    ; tested before the generic operator case.
    If Instr(DIGITS$, ch$) <> 0 Then
        GetDecNumber$(state)
    ElseIf Instr(ALPHA$, ch$) <> 0 Then
        GetWord(state)
    ElseIf ch$ = ";" Then
        GetComment(state)
    ElseIf ch$ = QUOTE$ Then
        GetString(state)
    ElseIf ch$ = "%" Then
        GetBinNumber(state)
    ElseIf ch$ = "$" Then
        GetHexNumber(state)
    ElseIf Instr(DELIM$, ch$) <> 0 Then
        GetOperator(state)
    ElseIf ch$ = Chr$(LF)
        GetEOL(state)
    Else
        Error("Unrecognised character " + ch$ + "(" + Asc(ch$) + ") in file", state)
    EndIf

    ; DebugLog Str$(state\lineNum) + ": " + state\tok$

End Function

; Example usage
;
;
;
;inFile$  = Input$("Input file: ")
;
;OpenState(inFile$)
;state.TState = Last TState
;While state\tok$ <> ""
; If state\tokType <> TOK_EOL Then
; Print state\tok$
; EndIf
; GetToken(state)
;Wend
;CloseState()
;
;Print "Press a key"
;WaitKey
;
;End


Comments :


_33(Posted 1+ years ago)

 My practical adaptation of the previous code. func_tokenizer.bb :
; Author: Steve Hill
; Date: 2003-08-27 13:41:23
; Title: Blitz Lexical Analyser
; Description: Blitz functions to split a Blitz source file into tokens
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; OpenState(file$,mode) - creates a new TState
; GetToken(state)       - read the next token
; CloseState()          - destroys and closes the current state
;
; The current token is available in state\token$
;
; Versions
; 0.1 Initial version                                   27 Aug 2003
; 0.2 Added >< => =<                                  29 Aug 2003
; 0.3   Added "mode" select                               16 Apr 2007 by _33
;       REPLACED "tok" by "token" (this is not Tokamak)   16 Apr 2007 by _33
; 0.4   Removed PRINTs and ENDs which made this unusable  16 Apr 2007 by _33
;       Added some error management "ok flag"             16 Apr 2007 by _33
;
; FUTURE RELEASE: -Add function to return FLOAT, INT, and STRING
;                 -More cleanup, optimize, expand on concept
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; ASCII codes of the space and control characters the lexer tests for.
Const  SPACE        = 32
Const  TAB = 9
Const  CR = 13
Const  LF           = 10

; Characters skipped between tokens. LF is deliberately absent: GetToken
; reports a line feed as its own end-of-line token.
Global WHITE_SPACE$   = Chr$(SPACE) + Chr$(TAB) + Chr$(CR)
; Characters that may start a word.
Const  ALPHA$         = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
; Character sets accepted inside decimal, hexadecimal and binary literals.
Const  DIGITS$        = "0123456789"
Const  HEXDIGITS$     = "0123456789abcdefABCDEF"
Const  BINDIGITS$     = "01"
; Characters GetToken passes to GetOperator. "%" and "$" are listed here
; but GetToken tests for them first and routes them to GetBinNumber and
; GetHexNumber instead.
Const  DELIM$  = "^*+-~<>/#%.$()[],=:?!"
; Double-quote character, the string literal delimiter.
Global QUOTE$         = Chr$(34)
; Characters that may continue a word (GetWord also accepts "_").
Global ALPHANUM$      = ALPHA$ + DIGITS$

; TState
;
; Keeps track of the current file, look-ahead character and token.
; Can be used as a stack for include files.
;
Type TState
; File handle returned by ReadFile; only set when mode = 1.
Field file
; Input source: 1 = read from a file, 2 = tokenise the text in content$.
Field mode
; Look-ahead character: the next unread character, "" at end of input.
Field ch$
; Text of the current token.
Field token$
; Kind of the current token: one of the TOKEN_* constants.
Field tokenType
; Current line number; starts at 1 and is advanced by GetChar.
Field lineNum
; Field charNum
; Text being tokenised when mode = 2.
Field content$     ;v0.3
; NOTE(review): never assigned or read by the code in this listing.
Field content_len% ;v0.3
; Position in content$ of the character last read by GetChar (mode 2).
    Field ch_pos       ;v0.3
End Type


; Token kinds stored in the tokenType field of TState.
; Identifier or keyword (set by GetWord).
Const TOKEN_WORD = 0
; Operator or delimiter, including the words And / Or / Xor.
Const TOKEN_OPERATOR = 1
; Comment starting with ";" and running to the end of the line.
Const TOKEN_COMMENT = 2
; Decimal literal, e.g. 1 or 1.2
Const TOKEN_DEC_NUMBER = 3
; Hexadecimal literal, e.g. $abC1
Const TOKEN_HEX_NUMBER = 4
; Binary literal, e.g. %1100
Const TOKEN_BIN_NUMBER = 5
; String literal; the token text excludes the surrounding quotes.
Const TOKEN_STRING = 6
; Line feed.
Const TOKEN_EOL = 7
; End of input; the token text is "".
Const TOKEN_EOF = 8
; A character that starts no known token (set by GetUnrecognized).
Const TOKEN_UNKNOWN = 9

; TDescriptor
;
; Describes a function: name, return type and parameter type list.
; Assigned unique id for each functions ... its "pointer"
;
; NOTE(review): no code in this listing creates or reads a TDescriptor;
; it is presumably used by a tool built on top of the lexer.
Type TDescriptor
; Function name.
Field name$
; Return type.
Field typ$
; Parameter type list.
Field params$
; Unique id assigned to the function.
Field id
End Type

; OpenState
;
; Open the file initialise the fields
; 1 = file, 2 = inline tokenize
Function OpenState(fi$,mode)
   ; Creates a new TState and reads the first token into it.
   ;
   ; fi$  - file name when mode = 1, the text to tokenise when mode = 2
   ; mode - 1 = read from file, 2 = inline tokenise
   ;
   ; Returns 0 if the file cannot be opened, otherwise the result of the
   ; first GetToken call. The new state is reachable with "Last TState".
   state.TState = New TState
   state\mode = mode
   state\ch_pos = 0
   If state\mode = 1 Then
      state\file = ReadFile(fi$)
      If state\file = 0 Then
         ; Do not leave a half-initialised state in the list: a later
         ; CloseState would try to close its zero file handle.
         Delete state
         Return 0
      EndIf
   ElseIf state\mode = 2
      state\content$ = fi$
   EndIf
   state\lineNum = 1
   state\ch$ = ""
   state\token$ = ""
   Return GetToken(state)
End Function

; CloseState
;
; Close current file, pop state
;
Function CloseState()
   ; Closes the file of every file-mode TState and then deletes all
   ; states, not just the most recently opened one.
   For state.TState = Each TState
      ; A state whose ReadFile failed has file = 0 and must not be closed.
      If state\mode = 1 And state\file <> 0 Then CloseFile(state\file)
   Next
   Delete Each TState
End Function

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; LEXICAL FUNCTIONS
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; GetChar
;
; Read a character from the current file into the state
;
Function GetChar(state.TState)
   ; Advances the look-ahead character state\ch$ by one character, taken
   ; from the file (mode 1) or from state\content$ (any other mode).
   ; At end of input state\ch$ becomes "".

   ; The line counter is bumped when the character being replaced is a
   ; line feed. ch_pos is NOT reset here: it is the read index into
   ; content$, and resetting it would restart inline input from the
   ; beginning after every line.
   If state\ch$ = Chr$(LF) Then state\lineNum = state\lineNum + 1

   If state\mode = 1 Then
      If Eof(state\file) Then state\ch$ = "" : Return
      state\ch$ = Chr$(ReadByte(state\file))
   Else
      state\ch_pos = state\ch_pos + 1
      ; Mid$ yields "" once ch_pos runs past the end of content$.
      state\ch$ = Mid$(state\content$,state\ch_pos,1)
   EndIf
End Function

; SkipSpace
;
; Skip white space
;
Function SkipSpace(state.TState)
   ; Advances the look-ahead past spaces, tabs and carriage returns.
   ; Line feeds are not skipped; they become end-of-line tokens.
   ;
   ; NOTE(review): OpenState starts with state\ch$ = "", so the first read
   ; appears to depend on Instr reporting a match for an empty search
   ; string - confirm before changing the loop condition.
   While Instr(WHITE_SPACE$, state\ch$) <> 0
      GetChar(state)
      ; "" marks end of input; stop instead of looping forever.
      If state\ch$ = "" Return
   Wend
End Function

; GetFollowing
;
; Generic token reader, reads while characters are
; those in pat.  Places token in state.
;
Function GetFollowing(state.TState, pat$)
   ; Generic token reader. The current look-ahead character is always
   ; taken as the first character of the token; further characters are
   ; appended for as long as they occur in pat$. The result is stored in
   ; state\token$ and the look-ahead is left on the first character that
   ; does not belong to the token.
   text$ = state\ch$
   GetChar(state)
   While Instr(pat$, state\ch$) <> 0
      ; End of input ends the token.
      If state\ch$ = "" Exit
      text$ = text$ + state\ch$
      GetChar(state)
   Wend
   state\token$ = text$
End Function

; GetDecNumber
;
; eg. 1 or 1.2
Function GetDecNumber$(state.TState)
   ; Reads a run of digits and "." into state\token$ and marks it as a
   ; decimal number. The number of "." characters is not checked, so
   ; "1.2.3" is read as a single token.
   GetFollowing(state, DIGITS$ + ".")
   state\tokenType = TOKEN_DEC_NUMBER
End Function

; GetBinNumber
;
; eg. %1100
Function GetBinNumber$(state.TState)
   ; Reads "%" followed by binary digits into state\token$.
   ; A "%" with no digits after it (e.g. the integer type tag in "a%")
   ; is reported as an operator, mirroring what GetHexNumber does for a
   ; lone "$".
   GetFollowing(state, BINDIGITS$)
   If state\token$ = "%"
      state\tokenType = TOKEN_OPERATOR
   Else
      state\tokenType = TOKEN_BIN_NUMBER
   EndIf
End Function

; GetHexNumber
;
; eg. $abC1
Function GetHexNumber$(state.TState)
   ; Reads "$" followed by hexadecimal digits into state\token$.
   ; A "$" with no digits after it is reported as an operator rather
   ; than as a hexadecimal number.
   GetFollowing(state, HEXDIGITS$)
   If state\token$ = "$"
      state\tokenType = TOKEN_OPERATOR
   Else
      state\tokenType = TOKEN_HEX_NUMBER
   EndIf
End Function

; GetEOL
;
; Get end of line
Function GetEOL(state.TState)
   ; Stores the current look-ahead character (a line feed when called
   ; from GetToken) as an end-of-line token and advances past it.
   state\token$ = state\ch$
   GetChar(state)
   state\tokenType = TOKEN_EOL
End Function

; GetWord
;
; eg. myVar_2 or WaitKey
Function GetWord(state.TState)
   ; Reads an identifier or keyword (letters, digits and "_") into
   ; state\token$. The words And, Or and Xor are reported as operators;
   ; every other word is TOKEN_WORD.
   GetFollowing(state, ALPHANUM$ + "_")
   ; Compare in lower case: Blitz keywords are case-insensitive, so
   ; "or", "OR" and "Or" must all be recognised.
   word$ = Lower$(state\token$)
   If word$ = "or" Or word$ = "and" Or word$ = "xor" Then
      state\tokenType = TOKEN_OPERATOR
   Else
      state\tokenType = TOKEN_WORD
   EndIf
End Function

Function GetOperator(state.TState)
   ; Reads an operator into state\token$ and marks it TOKEN_OPERATOR.
   ; Every operator is a single character except the two-character
   ; comparisons >= >< <= <> => =<, which are combined into one token.
   first$ = state\ch$
   GetChar(state)
   ; Character after the operator; "" at end of input, which matches
   ; none of the tests below.
   nxt$ = state\ch$

   op$ = first$
   Select first$
      Case ">"
         If nxt$ = "=" Or nxt$ = "<" Then op$ = first$ + nxt$
      Case "<"
         If nxt$ = "=" Or nxt$ = ">" Then op$ = first$ + nxt$
      Case "="
         If nxt$ = ">" Or nxt$ = "<" Then op$ = first$ + nxt$
   End Select

   ; Consume the second character only when it became part of the token.
   If Len(op$) = 2 Then GetChar(state)

   state\token$ = op$
   state\tokenType = TOKEN_OPERATOR
End Function

Function GetComment(state.TState)
   ; Reads a comment, starting at the current look-ahead character (the
   ; ";") and running up to, but not including, the next line feed or the
   ; end of input. Carriage returns are dropped from the token text.
   text$ = state\ch$
   GetChar(state)
   While state\ch$ <> Chr$(LF)
      If state\ch$ = "" Exit
      If state\ch$ <> Chr$(CR)
         text$ = text$ + state\ch$
      EndIf
      GetChar(state)
   Wend
   state\token$ = text$
   state\tokenType = TOKEN_COMMENT
End Function

Function GetString(state.TState)
   ; Reads a string literal. The look-ahead is on the opening quote when
   ; this is called. state\token$ receives the characters between the
   ; quotes; the quotes themselves are not part of the token text.
   ; If the input ends before a closing quote, the token is whatever was
   ; read up to that point.
   state\token$ = ""
   GetChar(state)
   While state\ch$ <> QUOTE$ And state\ch$ <> ""
      state\token$ = state\token$ + state\ch$
      GetChar(state)
   Wend
   ; Step past the closing quote when there is one.
   If state\ch$ <> ""
      GetChar(state)
   EndIf
   state\tokenType = TOKEN_STRING
End Function

Function GetUnrecognized(state.TState)
   ; Stores the current look-ahead character as a one-character token of
   ; type TOKEN_UNKNOWN and advances past it, so lexing can continue.
   state\token$ = state\ch$
   state\tokenType = TOKEN_UNKNOWN
   GetChar(state)
End Function


; GetToken
;
; Use first character to determine type of token and then
; read appropriate token using the corresponding Get function
Function GetToken%(state.TState)
   ; Reads the next token into state\token$ / state\tokenType.
   ; White space is skipped first; the first remaining character selects
   ; the reader function. At end of input the token is "" with type
   ; TOKEN_EOF. A character that starts no known token becomes a
   ; TOKEN_UNKNOWN token.
   ;
   ; Always returns 1.
   SkipSpace(state)
   ch$ = state\ch$
   If ch$ = "" Then
      state\token$ = ""
      state\tokenType = TOKEN_EOF
      Return 1
   EndIf
   ; The order matters: "%" and "$" are also in DELIM$ and must be
   ; tested before the generic operator case.
   If Instr(DIGITS$, ch$) <> 0 Then
      GetDecNumber$(state)
   ElseIf Instr(ALPHA$, ch$) <> 0 Then
      GetWord(state)
   ElseIf ch$ = ";" Then
      GetComment(state)
   ElseIf ch$ = QUOTE$ Then
      GetString(state)
   ElseIf ch$ = "%" Then
      GetBinNumber(state)
   ElseIf ch$ = "$" Then
      GetHexNumber(state)
   ElseIf Instr(DELIM$, ch$) <> 0 Then
      GetOperator(state)
   ElseIf ch$ = Chr$(LF)
      GetEOL(state)
   Else
      GetUnrecognized(state)
   EndIf
   Return 1
End Function
; TEST:
; Interactive driver for func_tokenizer.bb. Asks for a mode and an input
; (a file name for mode 1, the text itself for mode 2), then writes every
; token prefixed with a one-letter type code. An empty input ends the
; program; Escape aborts the current listing.
Graphics 1024,768,32,2
Include "func_tokenizer.bb"

mode = Input$("type 1 to read from file, 2 to enter your own test: ")
inFile$  = Input$("Input file: ")
While inFile$ <>""
   Print ""
   Print "Parsing " + inFile$

   ; ok is 0 when the file could not be opened; the token loop is then
   ; skipped entirely.
   ok = OpenState(inFile$, mode)
   state.TState = Last TState
   While ok
      Color 0,255,0
      Select state\tokenType
      Case TOKEN_WORD       Write "W" : Color 0,255,255 : Write state\token$
      Case TOKEN_OPERATOR   Write "O" : Color 0,255,255 : Write state\token$
      Case TOKEN_COMMENT    Write "C" : Color 0,255,255 : Write state\token$
      Case TOKEN_DEC_NUMBER Write "D" : Color 0,255,255 : Write state\token$
      Case TOKEN_HEX_NUMBER Write "H" : Color 0,255,255 : Write state\token$
      Case TOKEN_BIN_NUMBER Write "B" : Color 0,255,255 : Write state\token$
      Case TOKEN_STRING     Write "S" : Color 255,255,255 : Write state\token$
      Case TOKEN_EOL        Color 255,255,0 : Write "EOL" : Print ""
      Case TOKEN_EOF        Color 255,0,255 : Write "EOF" : Print "" : ok = 0
      Case TOKEN_UNKNOWN    Color 255,0,0 : Write "U"
      End Select
      If ok Then ok = GetToken(state)
      ; Key code 1 is Escape.
      If KeyHit(1) Then ok = 0
   Wend
   CloseState()
   Color 255,255,255
   inFile$  = Input$("Input file: ")
Wend
End