// ribbit/test/tokenizer.test.ts
// Unit tests for InlineTokenizer and MarkdownSerializer.

import { ribbit, getWindow } from './setup';
import { InlineTokenizer, type InlineToken } from '../src/ts/tokenizer';
import { MarkdownSerializer, type SerializerTagDef } from '../src/ts/serializer';
// DOM globals have to exist before any test body runs, so install them
// at module load time.
getWindow();

// Inline tag definitions handed to the tokenizer under test.
// NOTE(review): the meaning and ordering of `precedence` are defined by
// InlineTokenizer, which is not visible from this file — confirm there.

/** `**text**` maps to <strong>. */
const boldDef = {
  delimiter: '**',
  htmlTag: 'strong',
  recursive: true,
  precedence: 40,
};

/** `*text*` maps to <em>. */
const italicDef = {
  delimiter: '*',
  htmlTag: 'em',
  recursive: true,
  precedence: 50,
};

/** `~~text~~` maps to <del>. */
const strikeDef = {
  delimiter: '~~',
  htmlTag: 'del',
  recursive: true,
  precedence: 45,
};

/** `` `text` `` maps to <code>; the only definition marked non-recursive. */
const codeDef = {
  delimiter: '`',
  htmlTag: 'code',
  recursive: false,
  precedence: 10,
};

// Single tokenizer instance shared by every InlineTokenizer test below.
const inlineTagDefs = [boldDef, italicDef, strikeDef, codeDef];
const tokenizer = new InlineTokenizer(inlineTagDefs);
/**
 * Collects the `role` of every token, in order, so a test can compare the
 * overall token structure with a single assertion.
 *
 * @param tokens Tokens as returned by the tokenizer.
 * @returns One role string per input token.
 */
function roles(tokens: InlineToken[]): string[] {
  const collected: string[] = [];
  for (const { role } of tokens) {
    collected.push(role);
  }
  return collected;
}
/**
 * Collects the `value` of every token, in order.
 *
 * @param tokens Tokens as returned by the tokenizer.
 * @returns One value string per input token.
 */
function values(tokens: InlineToken[]): string[] {
  const collected: string[] = [];
  for (const { value } of tokens) {
    collected.push(value);
  }
  return collected;
}
/**
 * InlineTokenizer tests. Each case feeds one markdown snippet to the shared
 * `tokenizer` and checks the roles and payload fields of the tokens returned.
 */
describe('InlineTokenizer', () => {
  describe('plain text', () => {
    it('produces a single text token', () => {
      const tokens = tokenizer.tokenize('hello world');
      expect(roles(tokens)).toEqual(['text']);
      expect(values(tokens)).toEqual(['hello world']);
    });
  });

  describe('bold', () => {
    it('tokenizes **bold**', () => {
      const tokens = tokenizer.tokenize('**bold**');
      expect(roles(tokens)).toEqual(['open', 'text', 'close']);
      expect(tokens[0].delimiter).toBe('**');
      expect(tokens[1].value).toBe('bold');
    });

    it('tokenizes text **bold** text', () => {
      const tokens = tokenizer.tokenize('hello **bold** end');
      expect(roles(tokens)).toEqual(['text', 'open', 'text', 'close', 'text']);
    });
  });

  describe('italic', () => {
    it('tokenizes *italic*', () => {
      const tokens = tokenizer.tokenize('*italic*');
      expect(roles(tokens)).toEqual(['open', 'text', 'close']);
      expect(tokens[0].delimiter).toBe('*');
    });
  });

  describe('strikethrough', () => {
    it('tokenizes ~~struck~~', () => {
      const tokens = tokenizer.tokenize('~~struck~~');
      expect(roles(tokens)).toEqual(['open', 'text', 'close']);
      expect(tokens[0].delimiter).toBe('~~');
    });
  });

  describe('code spans', () => {
    it('tokenizes `code`', () => {
      const tokens = tokenizer.tokenize('`code`');
      expect(roles(tokens)).toEqual(['code']);
      expect(tokens[0].content).toBe('code');
    });

    it('does not parse delimiters inside code', () => {
      const tokens = tokenizer.tokenize('`**not bold**`');
      expect(roles(tokens)).toEqual(['code']);
      expect(tokens[0].content).toBe('**not bold**');
    });
  });

  describe('backslash escapes', () => {
    it('\\* becomes literal *', () => {
      const tokens = tokenizer.tokenize('\\*hello');
      expect(roles(tokens)).toEqual(['text']);
      expect(tokens[0].value).toBe('*hello');
    });

    it('\\\\ becomes literal \\', () => {
      const tokens = tokenizer.tokenize('\\\\');
      expect(roles(tokens)).toEqual(['text']);
      expect(tokens[0].value).toBe('\\');
    });

    it('\\n at end of line is a hard break', () => {
      // Input is "hello", a backslash, a real newline, then "world".
      const tokens = tokenizer.tokenize('hello\\\nworld');
      expect(roles(tokens)).toEqual(['text', 'break', 'text']);
    });
  });

  describe('hard line breaks', () => {
    it('two trailing spaces before newline', () => {
      // repeat() keeps the space count explicit: this input must differ from
      // the single-space case below, which expects the opposite outcome.
      const tokens = tokenizer.tokenize('hello' + ' '.repeat(2) + '\nworld');
      expect(roles(tokens)).toEqual(['text', 'break', 'text']);
    });

    it('single space does not break', () => {
      const tokens = tokenizer.tokenize('hello \nworld');
      const breakTokens = tokens.filter(token => token.role === 'break');
      expect(breakTokens.length).toBe(0);
    });
  });

  describe('entity resolution', () => {
    // Each input must contain the literal entity text (ampersand, name or
    // number, semicolon); feeding the already-decoded character would not
    // exercise entity resolution at all.
    it('&amp; becomes &', () => {
      const tokens = tokenizer.tokenize('a &amp; b');
      expect(tokens[0].value).toBe('a & b');
    });

    it('&#123; becomes {', () => {
      // Decimal numeric character reference.
      const tokens = tokenizer.tokenize('&#123;');
      expect(tokens[0].value).toBe('{');
    });

    it('&#x7B; becomes {', () => {
      // Hexadecimal numeric character reference for the same character.
      const tokens = tokenizer.tokenize('&#x7B;');
      expect(tokens[0].value).toBe('{');
    });
  });

  describe('links', () => {
    it('tokenizes [text](url)', () => {
      const tokens = tokenizer.tokenize('[click](http://x)');
      expect(roles(tokens)).toEqual(['link']);
      expect(tokens[0].href).toBe('http://x');
      expect(tokens[0].value).toBe('click');
    });

    it('tokenizes [text](url "title")', () => {
      const tokens = tokenizer.tokenize('[click](http://x "My Title")');
      expect(tokens[0].title).toBe('My Title');
    });

    it('disallows [ in link text', () => {
      const tokens = tokenizer.tokenize('[outer [inner](b)](a)');
      // Should not match as a single outer link wrapping the inner one, so
      // at most one link token may come out.
      const linkTokens = tokens.filter(token => token.role === 'link');
      expect(linkTokens.length).toBeLessThanOrEqual(1);
    });
  });

  describe('autolinks', () => {
    it('tokenizes <url>', () => {
      const tokens = tokenizer.tokenize('<https://example.com>');
      expect(roles(tokens)).toEqual(['autolink']);
      expect(tokens[0].href).toBe('https://example.com');
    });

    it('tokenizes bare URL', () => {
      const tokens = tokenizer.tokenize('visit https://example.com today');
      expect(tokens.some(token => token.role === 'autolink')).toBe(true);
    });
  });

  describe('HTML passthrough', () => {
    it('tokenizes HTML tags', () => {
      const tokens = tokenizer.tokenize('a <span>b</span> c');
      const htmlTokens = tokens.filter(token => token.role === 'html');
      expect(htmlTokens.length).toBe(2);
      expect(htmlTokens[0].value).toBe('<span>');
      expect(htmlTokens[1].value).toBe('</span>');
    });
  });

  describe('flanking rules', () => {
    it('mid-word * is not a delimiter', () => {
      const tokens = tokenizer.tokenize('2*3*4');
      expect(roles(tokens)).toEqual(['text']);
    });

    it('* at word boundary is a delimiter', () => {
      const tokens = tokenizer.tokenize('*hello*');
      expect(roles(tokens)).toEqual(['open', 'text', 'close']);
    });
  });

  describe('nested delimiters', () => {
    it('bold inside italic', () => {
      const tokens = tokenizer.tokenize('*hello **world***');
      // One opener for the italic run and one for the bold run inside it.
      const openTokens = tokens.filter(token => token.role === 'open');
      expect(openTokens.length).toBe(2);
    });
  });
});
/**
 * MarkdownSerializer tests. Each case builds a small detached DOM fragment
 * and checks the markdown string produced for it, including the escaping of
 * markdown-significant characters that appear in plain text nodes.
 */
describe('MarkdownSerializer', () => {
  // Markdown hard break: two trailing spaces followed by a newline. A single
  // space would not read back as a break. repeat() keeps the count explicit.
  const hardBreak = ' '.repeat(2) + '\n';

  // Tag name -> how that element is written out as markdown.
  const tagMap = new Map<string, SerializerTagDef>([
    ['STRONG', { delimiter: '**' }],
    ['B', { delimiter: '**' }],
    ['EM', { delimiter: '*' }],
    ['I', { delimiter: '*' }],
    ['DEL', { delimiter: '~~' }],
    ['CODE', {
      // Uses the raw text content rather than serialized children.
      serialize: (element) => '`' + (element.textContent ?? '') + '`',
    }],
    ['A', {
      serialize: (element, children) => {
        const href = element.getAttribute('href') ?? '';
        const title = element.getAttribute('title');
        // A missing or empty title contributes nothing to the output.
        const titlePart = title ? ` "${title}"` : '';
        return '[' + children() + '](' + href + titlePart + ')';
      },
    }],
    ['BR', {
      serialize: () => hardBreak,
    }],
  ]);
  const delimiterChars = new Set(['*', '`', '~']);
  const serializer = new MarkdownSerializer(tagMap, delimiterChars);

  /** Creates a detached <div> with `text` assigned to its textContent. */
  function divWithText(text: string) {
    const div = document.createElement('div');
    div.textContent = text;
    return div;
  }

  /** Creates a detached <div> with `html` assigned to its innerHTML. */
  function divWithHtml(html: string) {
    const div = document.createElement('div');
    div.innerHTML = html;
    return div;
  }

  it('serializes plain text', () => {
    expect(serializer.serialize(divWithText('hello world'))).toBe('hello world');
  });

  it('serializes bold', () => {
    expect(serializer.serialize(divWithHtml('<strong>bold</strong>'))).toBe('**bold**');
  });

  it('serializes italic', () => {
    expect(serializer.serialize(divWithHtml('<em>italic</em>'))).toBe('*italic*');
  });

  it('escapes * in text nodes', () => {
    expect(serializer.serialize(divWithText('hello * world'))).toBe('hello \\* world');
  });

  it('escapes _ in text nodes', () => {
    expect(serializer.serialize(divWithText('hello_world'))).toBe('hello\\_world');
  });

  it('escapes \\ in text nodes', () => {
    expect(serializer.serialize(divWithText('back\\slash'))).toBe('back\\\\slash');
  });

  it('escapes < before letters', () => {
    expect(serializer.serialize(divWithText('a <b> c'))).toBe('a \\<b> c');
  });

  it('does not escape < before non-letters', () => {
    expect(serializer.serialize(divWithText('1 < 2'))).toBe('1 < 2');
  });

  it('does not escape * inside delimiters', () => {
    const result = serializer.serialize(divWithHtml('<strong>bold</strong>'));
    // The ** are delimiter tokens, not escaped
    expect(result).toBe('**bold**');
    expect(result).not.toContain('\\*');
  });

  it('escapes * in text adjacent to delimiters', () => {
    const result = serializer.serialize(divWithHtml('<strong>bold</strong> * text'));
    expect(result).toContain('\\*');
  });

  it('serializes link', () => {
    expect(serializer.serialize(divWithHtml('<a href="http://x">click</a>')))
      .toBe('[click](http://x)');
  });

  it('serializes link with title', () => {
    expect(serializer.serialize(divWithHtml('<a href="http://x" title="T">click</a>')))
      .toBe('[click](http://x "T")');
  });

  it('serializes code', () => {
    expect(serializer.serialize(divWithHtml('<code>x</code>'))).toBe('`x`');
  });

  it('serializes hard break', () => {
    expect(serializer.serialize(divWithHtml('hello<br>world')))
      .toBe('hello' + hardBreak + 'world');
  });
});