🔤 Rust Lexical Structure
📋 Token Types Quick Reference
Keywords
fn, let, mut, if, else, match
Reserved language constructs
Identifiers
variable_name, FunctionName
Names for variables, functions, types
Literals
42, "hello", 'c', true
Direct values in source code
Operators
+, -, *, ==, &&, |
Mathematical and logical operations
Punctuation
{ } ( ) [ ] ; , . ::
Structure and grouping symbols
Comments
// line /* block */ /// doc
Code documentation and notes
🏗️ What is Lexical Structure?
Lexical structure defines how Rust source code is broken down into individual tokens - the smallest meaningful units of the language. Understanding lexical structure is fundamental to reading and writing Rust code effectively.
Token Classification
Input Text
Lexical Analysis
Tokens
Parser
AST
Identifier Grammar
XID_Start
XID_Continue*
|
_
XID_Continue+
⚙️ How Tokenization Works
Source Code
fn main() {
let x = 42;
println!("Hello");
}
Tokens Produced
KEYWORD(fn)
IDENT(main)
PUNCT(()
PUNCT())
PUNCT({)
KEYWORD(let)
IDENT(x)
PUNCT(=)
LITERAL(42)
PUNCT(;)
IDENT(println)
PUNCT(!)
PUNCT(()
LITERAL("Hello")
PUNCT())
PUNCT(;)
PUNCT(})
🏷️ Identifier Rules
✅ Valid Identifiers
// Basic identifiers
let variable_name = 42;
let CamelCase = "hello";
let _private = true;
let __internal = 0;
// Unicode identifiers
let 变量 = "Chinese";
let переменная = "Russian";
let μ = 3.14159;
// Raw identifiers
let r#fn = "function";
let r#match = "pattern";
❌ Invalid Identifiers
// Cannot start with numbers
// let 2fast = "error";
// Cannot use keywords without r#
// let fn = "error";
// let match = "error";
// Cannot use reserved symbols
// let @ = "error";
// let # = "error";
📝 Literal Types
Integer Literals
42, 0xFF, 0o755, 0b1010
Decimal, hex, octal, binary
Float Literals
3.14, 2.5e10, 1.0f32
Scientific notation supported
String Literals
"hello", r"raw", br"bytes"
UTF-8 strings with escapes; raw forms (r"...") skip escape processing, and b"..."/br"..." are byte strings
Character Literals
'a', '\n', '\u{41}', b'A'
Single Unicode scalar values (b'A' is a byte literal of type u8)
Boolean Literals
true, false
Built-in boolean values
🗂️ Token Categories
🤖 For AI Coding Agents
{
"rust_lexical_structure": {
"tokenization_rules": {
"whitespace": {
"ignored": ["space", "tab", "newline", "carriage_return"],
"significant": "Only for token separation"
},
"identifiers": {
"pattern": "XID_Start XID_Continue* | _ XID_Continue+",
"raw_form": "r#identifier_name",
"unicode_support": true,
"case_sensitive": true
},
"keywords": {
"strict": [
"as", "async", "await", "break", "const", "continue", "crate", "dyn", "else", "enum",
"extern", "false", "fn", "for", "if", "impl", "in", "let",
"loop", "match", "mod", "move", "mut", "pub", "ref", "return",
"self", "Self", "static", "struct", "super", "trait", "true",
"type", "unsafe", "use", "where", "while"
],
"weak": ["union", "macro_rules", "'static", "raw", "safe"],
"reserved": ["abstract", "become", "box", "do", "final", "gen", "macro", "override", "priv", "try", "typeof", "unsized", "virtual", "yield"]
},
"literals": {
"integer": {
"decimal": "123, 123_456",
"hexadecimal": "0xFF, 0xff",
"octal": "0o755",
"binary": "0b1010_1111",
"suffixes": ["u8", "u16", "u32", "u64", "u128", "usize", "i8", "i16", "i32", "i64", "i128", "isize"]
},
"float": {
"format": "123.45, 1.23e10, 1.23E-10",
"suffixes": ["f32", "f64"]
},
"string": {
"basic": "\"hello world\"",
"raw": "r\"no\\escapes\"",
"raw_with_hashes": "r#\"can contain \"quotes\"\"#",
"byte_string": "b\"bytes only\"",
"raw_byte_string": "br\"raw bytes\""
},
"character": {
"basic": "'a', 'Z', '5'",
"escaped": "'\\n', '\\t', '\\''",
"unicode": "'α', '\\u{1F980}'",
"byte": "b'A'"
},
"boolean": ["true", "false"]
},
"operators": {
"arithmetic": ["+", "-", "*", "/", "%"],
"comparison": ["==", "!=", "<", ">", "<=", ">="],
"logical": ["&&", "||", "!"],
"bitwise": ["&", "|", "^", "<<", ">>"],
"assignment": ["=", "+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=", "<<=", ">>="],
"special": ["->", "=>", "::", ".", "..", "...", "?", "@"]
},
"punctuation": {
"brackets": ["(", ")", "[", "]", "{", "}"],
"separators": [";", ",", ":"],
"reference": ["&", "*"]
},
"comments": {
"line": "// comment text",
"block": "/* comment text */",
"doc_outer": "/// documentation",
"doc_inner": "//! inner doc",
"doc_block_outer": "/** documentation */",
"doc_block_inner": "/*! inner doc */"
}
},
"parsing_context": {
"token_precedence": "Operators have precedence rules for parsing",
"contextual_keywords": "Some keywords are only reserved in specific contexts",
"path_resolution": ":: is used for absolute and relative path resolution",
"macro_invocation": "! suffix indicates macro calls"
}
}
}
🔍 Advanced Tokenization
Raw Identifiers
Use r# prefix to use keywords as identifiers:
// Useful for FFI or when interfacing with other languages
let r#type = "string";
let r#match = "pattern";
let r#fn = "function";
// But not needed for contextual keywords
let union = "allowed"; // 'union' is only reserved in specific contexts
Unicode Support
Rust supports Unicode in identifiers (any characters allowed by XID_Start/XID_Continue) and in strings:
// Variables can use any Unicode script
let 变量 = "Chinese variable";
let переменная = "Russian variable";
let μ = 3.14159; // Greek letter mu
// let 🦀 = "crab"; // Error: emoji are not XID_Start/XID_Continue characters, so they cannot be identifiers
// String literals support full Unicode
let emoji = "🦀🚀✨";
let math = "α² + β² = γ²";
Macro Invocation Syntax
Macros are distinguished by the ! suffix:
println!("This is a macro"); // Macro call
vec![1, 2, 3] // Macro call
format!("Hello {}", name) // Macro call
// vs regular function calls
std::process::exit(0); // Function call
String::from("hello"); // Function call
Lifetime Syntax
Lifetimes use apostrophe syntax but are not character literals:
// These are lifetime parameters, not character literals
fn longest<'a>(x: &'a str, y: &'a str) -> &'a str {
if x.len() > y.len() { x } else { y }
}
// Loop labels share the apostrophe syntax, but they name loops rather than lifetimes
'outer: loop {
'inner: loop {
break 'outer; // Break the outer loop
}
}
💬 Comment Types