--smr/src/lua/parser_imageboard.lua

--[[
A parser that approximates 8chan's markup:

Surround text with double single-quotes('') to make it italic
Surround text with triple single-quotes(''') to make it bold
Surround text with double underscores(__) to make it underlined
Surround text with double equals signs(==) to make it a heading
Surround text with double asterisks(**) to make it spoilered
Surround text with double tildes(~~) to make it struck through
Begin a line with a greater-than to make it
>greentext
Begin a line with a less-than to make it
<pinktext
Surround text with forum-style [spoiler] and [/spoiler] tags as a second way to spoiler
Surround text with forum-style [code] and [/code] tags to make it preformatted and monospace

]]

local lpeg = require("lpeg")
lpeg.locale(lpeg)
local V,P,C,S,B,Cs = lpeg.V,lpeg.P,lpeg.C,lpeg.S,lpeg.B,lpeg.Cs
--HTML-significant characters in body text and the entities that replace them
local escapes = {
	["&"] = "&amp;",
	["<"] = "&lt;",
	[">"] = "&gt;",
}
--Collect the keys of `escapes` so they can be joined into one character class
local esctbl = {}
for char in pairs(escapes) do
	esctbl[#esctbl + 1] = char
end
--Lua pattern capturing any single character that has an entry in `escapes`
local escapematch = "([" .. table.concat(esctbl) .. "])"
--gsub callback: maps a captured character to its entity, or leaves it as-is
local function sanitize_item(capture)
	local entity = escapes[capture]
	if entity then
		return entity
	end
	return capture
end
--Returns `text` with every occurrence of an `escapes` key replaced by its
--entity; the replacement count reported by gsub is discarded
local function sanitize(text)
	return (string.gsub(text,escapematch,sanitize_item))
end

--Grammar
--Optional run of spaces, tabs and carriage returns
local space = S(" \t\r")^0
--Everything that either starts markup or ends a line; plain text has to stop
--in front of any of these so the markup rules get a chance to match
local inline_marks = P("**") + P("''") + P("'''") + P("__") + P("==") + P("~~")
local line_breaks = P("\n>") + P("\n<") + P("\n")
local open_tags = P("[code]") + P("[spoiler]")
local special = inline_marks + line_breaks + open_tags
--A non-empty run of characters containing no `special` sequence, followed by
--optional whitespace; the captured run is passed through sanitize
local plain_run = Cs((1 - special)^1)
local word = (plain_run * space) / sanitize

--Builds a pattern for text enclosed by the same 'seq' marker on both sides and
--renders the enclosed text through 'format' (a string.format template)
--ex wrap("^^",[[<sup>%s</sup>]],V"sup")
--will wrap text "5^^3^^" as "5<sup>3</sup>"
--seq: marker that both opens and closes the span
--format: template; %s receives the already-processed text between the markers
--s: the grammar rule this pattern is being built for. It is subtracted from
--   V"marked", so the span's own rule is never matched as nested content.
--The third argument is necessary to stop exponential backtracking. This removes
--a DoS vulnerability: If tags are nested really deep, the parser can lock up,
--potentially locking up all processes.
--The earlier version matched the body as
--Cs(((V"marked" + word + P"\n"))^1), without the subtraction.
local function wrap(seq,format,s)
	local marker = P(seq)
	local nested = V"marked" - s
	--The body may span several lines, hence the explicit newline alternative
	local body = Cs((nested + word + P"\n")^1)
	local function render(inner)
		return string.format(format,inner)
	end
	return (marker * body * marker) / render
end

--Generates a pattern that formats text inside opening and closing "name" tags
--with a format, BB forum style
--name: tag name without the brackets, ex "code" matches [code]...[/code]
--format: string.format template; %s receives the escaped tag contents
--The contents are taken verbatim (nothing inside is parsed as markup) and
--are escaped with sanitize before being formatted.
--Empty contents are accepted. With a ^1 repetition an empty pair such as
--"[code][/code]" could not match, and because plain text is not allowed to
--consume an opening tag either, the line containing it failed to parse.
local function tag(name,format)
	local start_tag = P(string.format("[%s]",name))
	local end_tag = P(string.format("[/%s]",name))
	--^0 rather than ^1 so that an empty tag pair still matches
	return start_tag * Cs(((1 - end_tag))^0) * end_tag / function(a)
		return string.format(format,sanitize(a))
	end
end

--The markup grammar; matching starts at the "chunk" rule.
--No rule writes to stdout: this runs for every parsed text, so the debug
--prints that used to sit in "line" and "ending" have been removed.
local grammar = P{
	"chunk";
	--Inline spans delimited by the same marker on both sides (built by wrap)
	--and BB-style tag pairs with escaped, unparsed contents (built by tag)
	spoiler = wrap("**",[[<span class="spoiler">%s</span>]],V"spoiler"),
	spoiler2 = tag("spoiler",[[<span class="spoiler2">%s</span>]]),
	italic = wrap("''",[[<i>%s</i>]], V"italic"),
	bold = wrap("'''",[[<b>%s</b>]], V"bold"),
	underline = wrap("__",[[<u>%s</u>]], V"underline"),
	heading = wrap("==",[[<h2>%s</h2>]], V"heading"),
	strike = wrap("~~",[[<s>%s</s>]], V"strike"),
	code = tag("code",[[<pre><code>%s</code></pre>]]),
	--A ">" followed by the rest of the line, parsed for inline markup.
	--NOTE(review): the look-behind looks redundant, since ">" has just been
	--consumed when it runs - confirm before removing.
	greentext = P">" * (B"\n>" + B">") * Cs((V"marked" + word)^0) / function(a)
		return string.format([[<span class="greentext">&gt;%s</span>]],a)
	end,
	--Same as greentext, for lines starting with "<"
	pinktext = P"<" * (B"\n<" + B"<") * Cs((V"marked" + word)^0) / function(a)
		return string.format([[<span class="pinktext">&lt;%s</span>]],a)
	end,
	--Any inline span or tag pair; bold is tried before italic because the
	--italic marker '' is a prefix of the bold marker '''
	marked = V"spoiler" + V"bold" + V"italic" + V"underline" + V"heading" + V"strike" + V"spoiler2" + V"code",
	--Zero or more spans and runs of plain text, up to the end of the line
	plainline = (V"marked" + word)^0,
	--One newline-terminated line. A line whose whole content is a carriage
	--return (a blank line in CRLF text) becomes a line break, anything else
	--becomes a paragraph.
	line = Cs(V"greentext" + V"pinktext" + V"plainline" + P"") * P"\n" / function(a)
		if a == "\r" then
			return "<br/>"
		else
			return string.format("<p>%s</p>",a)
		end
	end,
	--Whatever the rules above could not consume is emitted escaped but
	--otherwise unformatted. This also matches the empty string, so it
	--succeeds on every input, including fully parsed ones.
	ending = C(P(1)^0) / sanitize,
	--Complete lines, then a possibly empty unterminated line, then the rest
	chunk = V"line"^0 * V"plainline" * V"ending"
}

--A chunk of text that the parser chokes on.
--Kept as a sample for manual testing: uncomment the print below the string to
--run the grammar over it. Nothing else in this file reads `s`.
local s = [=[
Minor update to the search function, also added a search bar to the front page.

Characters in '''bold''' are literal characters, things in ''<angle brackets and italics>'' are substitutions.


The search utility searches for stories on the site. At it's most simple, it searches stories based on tags, but it can also filter stories based on the fields: '''title''', '''author''', '''date''', and '''hits'''. In general, the syntax for search is {'''+-'''} ''<field>'' ''<operator>'' ''<value>''

The first '''+''' or '''-''' specifies weather to include or exclude results based on this search, the ''<field>'' specifies what field to search for (or search based on tag if this is missing), and ''<operator>'' specifies how to search.

For title and author, the only allowed operator is '''='''. This operator will search for ''<value>'' appearing anywhere in the field, case insensitive. For '''hits''' and '''time''', the allowed operators are '''>''','''<''','''>=''', '''<=''','''=''', which searches for greater than, less than, greater than or equal to, less than or equal to, and strictly equal to respectively. '''tag''' does not need a ''<field>'' or ''<operator>'', and only allows exact matches. As a quirk of this system, it is impossible to search for the tags "author", "title", "hits" or "date".

Examples:
[code]
+author=admin -meta
[/code]
Will return all stories by the users "admin" and "b'''admin'''ton_enthusiast" that do not include the "meta" tag.

[code]
+hits>20 -date>=1609459201
[/code]
Will return all stories with more than 20 hits that were posted before January 1, 2021 (unix timestamp 1609459201).

While the date field is a little hard to use for humans, it may be useful for robots. 
]=]
--print(table.concat({grammar:match(s .. "\n")}," "))
--Module value: a function that renders markup `text` as HTML.
--A newline is appended so the last line is terminated like every other line;
--all values produced by the grammar are then joined with single spaces.
return function(text)
	local terminated = text .. "\n"
	local pieces = {grammar:match(terminated)}
	return table.concat(pieces," ")
end
Started testing 2020-12-23 07:02:02 +01:00			`--[[`
			`A parser that approximates 8chan's markup:`

			`Surround text with double single-quotes(') to make text italic`
			`Surround text with triple single-quotes to make text bold`
			`Surround text with underscores(_) to make it underlined`
			`Surround text with double asterisks(*) to make it spoilered`
			`Surround text with tildes(~) to make it strike through`
			`Begin a line with a greater-than followed by a a space to make it`
			`>greentext`
			`Begin a line with a less-than followed by a space to make it`
			`<pinktext`
			`Surround text with forum-style [spoiler] and [/spoiler] tags as a second way to spoiler`
			`Surround text with forum-style [code] and [/code] tags to make it preformatted and monospace`

			`]]`

Inital commit 2020-05-16 01:10:11 +02:00			`local lpeg = require("lpeg")`
			`lpeg.locale(lpeg)`
			`local V,P,C,S,B,Cs = lpeg.V,lpeg.P,lpeg.C,lpeg.S,lpeg.B,lpeg.Cs`
			`--Characters to escape in the body text`
			`local escapes = {`
			`["&"] = "&",`
			`["<"] = "<",`
			`[">"] = ">",`
			`}`
			`local esctbl = {}`
			`for char,_ in pairs(escapes) do`
			`table.insert(esctbl,char)`
			`end`
			`local escapematch = string.format("([%s])",table.concat(esctbl))`
			`local function sanitize_item(capture)`
			`return escapes[capture] or capture`
			`end`
			`local function sanitize(text)`
			`local ret,_ = string.gsub(text,escapematch,sanitize_item)`
			`return ret`
			`end`

Move sql files into their own directory Moved all the of the sql out of the init file and into it's own directory. 2020-05-17 18:05:00 +02:00			`--Grammar`
Inital commit 2020-05-16 01:10:11 +02:00			`local space = S" \t\r"^0`
			`local special = P{`
			`P"**" + P"''" + P"'''" +`
			`P"__" + P"==" + P"~~" +`
			`P"\n>" + P"\n<" + P"\n" +`
			`P"[code]" + P"[spoiler]"`
			`}`
			`local word = Cs((1 - special)^1) * space / sanitize`

			`--Generates a pattern that formats text inside matching 'seq' tags with format`
			`--ex wrap("^^",[[<sup>%s</sup>]])`
Move sql files into their own directory Moved all the of the sql out of the init file and into it's own directory. 2020-05-17 18:05:00 +02:00			`--will wrap text "5^^3^^" as "5<sup>3</sup>"`
Fix the problem that caused the last crash. In certain cases, the imageboard parser will have exponential time complexity. While this patch fixes that particular problem, the correct solution is to implement a timeout for particularly gnarly parts of the code. 2020-12-29 21:19:05 +01:00			`--The third argument is nessessary to stop exponential backtracking. This removes`
			`--a DOS vulnerability: If tags are nested really deep, the parser can lock up,`
			`--potentially locking up all processes.`
			`--[[`
			`local function wrap(seq,format,V"sup")`
Added preview Added a preview button the the paste page that allows users to preview their paste before submitting. Various additions to the imageboard parser to make it more robust and accept more inputs. 2020-08-24 23:38:24 +02:00			`return P(seq) * Cs(((V"marked" + word + P"\n"))^1) * P(seq) / function(a)`
			`return string.format(format,a)`
Inital commit 2020-05-16 01:10:11 +02:00			`end`
			`end`
Fix the problem that caused the last crash. In certain cases, the imageboard parser will have exponential time complexity. While this patch fixes that particular problem, the correct solution is to implement a timeout for particularly gnarly parts of the code. 2020-12-29 21:19:05 +01:00			`]]`
			`local function wrap(seq,format,s)`
			`return P(seq) * Cs((((V"marked" - s) + word + P"\n"))^1) * P(seq) / function(a)`
			`return string.format(format,a)`
			`end`
			`end`
Inital commit 2020-05-16 01:10:11 +02:00
Move sql files into their own directory Moved all the of the sql out of the init file and into it's own directory. 2020-05-17 18:05:00 +02:00			`--Generates a pattern that formats text inside opening and closing "name" tags`
Inital commit 2020-05-16 01:10:11 +02:00			`--with a format, BB forum style`
			`local function tag(name,format)`
			`local start_tag = P(string.format("[%s]",name))`
			`local end_tag = P(string.format("[/%s]",name))`
Added preview Added a preview button the the paste page that allows users to preview their paste before submitting. Various additions to the imageboard parser to make it more robust and accept more inputs. 2020-08-24 23:38:24 +02:00			`return start_tag * Cs(((1 - end_tag))^1) * end_tag / function(a)`
Inital commit 2020-05-16 01:10:11 +02:00			`return string.format(format,sanitize(a))`
			`end`
			`end`

Move sql files into their own directory Moved all the of the sql out of the init file and into it's own directory. 2020-05-17 18:05:00 +02:00			`local grammar = P{`
Inital commit 2020-05-16 01:10:11 +02:00			`"chunk";`
			`--regular`
Fix the problem that caused the last crash. In certain cases, the imageboard parser will have exponential time complexity. While this patch fixes that particular problem, the correct solution is to implement a timeout for particularly gnarly parts of the code. 2020-12-29 21:19:05 +01:00			`spoiler = wrap("**",[[<span class="spoiler">%s</span>]],V"spoiler"),`
Inital commit 2020-05-16 01:10:11 +02:00			`spoiler2 = tag("spoiler",[[<span class="spoiler2">%s</span>]]),`
Fix the problem that caused the last crash. In certain cases, the imageboard parser will have exponential time complexity. While this patch fixes that particular problem, the correct solution is to implement a timeout for particularly gnarly parts of the code. 2020-12-29 21:19:05 +01:00			`italic = wrap("''",[[<i>%s</i>]], V"italic"),`
			`bold = wrap("'''",[[<b>%s</b>]], V"bold"),`
			`underline = wrap("__",[[<u>%s</u>]], V"underline"),`
			`heading = wrap("==",[[<h2>%s</h2>]], V"heading"),`
			`strike = wrap("~~",[[<s>%s</s>]], V"strike"),`
Inital commit 2020-05-16 01:10:11 +02:00			`code = tag("code",[[<pre><code>%s</code></pre>]]),`
Add comments Add comments for each paste, Also add a "Download TXT" button. 2020-08-13 19:59:33 +02:00			`greentext = P">" * (B"\n>" + B">") * Cs((V"marked" + word)^0) / function(a)`
Inital commit 2020-05-16 01:10:11 +02:00			`return string.format([[<span class="greentext">>%s</span>]],a)`
			`end,`
Add comments Add comments for each paste, Also add a "Download TXT" button. 2020-08-13 19:59:33 +02:00			`pinktext = P"<" * (B"\n<" + B"<") * Cs((V"marked" + word)^0) / function(a)`
Inital commit 2020-05-16 01:10:11 +02:00			`return string.format([[<span class="pinktext"><%s</span>]],a)`
			`end,`
			`marked = V"spoiler" + V"bold" + V"italic" + V"underline" + V"heading" + V"strike" + V"spoiler2" + V"code",`
			`plainline = (V"marked" + word)^0,`
			`line = Cs(V"greentext" + V"pinktext" + V"plainline" + P"") * P"\n" / function(a)`
Fix the problem that caused the last crash. In certain cases, the imageboard parser will have exponential time complexity. While this patch fixes that particular problem, the correct solution is to implement a timeout for particularly gnarly parts of the code. 2020-12-29 21:19:05 +01:00			`print("Found line:",a)`
Various updates + Added tags to stories + Added a view counter to stories + Added a very basic search by tag function ~ Modified imageboard parser so that blank lines are preserved ~ Raised the minimum cache time from 2 seconds to 20 seconds 2020-10-11 01:28:39 +02:00			`if a == "\r" then`
			`return "<br/>"`
			`else`
			`return string.format("<p>%s</p>",a)`
			`end`
Inital commit 2020-05-16 01:10:11 +02:00			`end,`
Added preview Added a preview button the the paste page that allows users to preview their paste before submitting. Various additions to the imageboard parser to make it more robust and accept more inputs. 2020-08-24 23:38:24 +02:00			`ending = C(P(1)^0) / function(a) print("failed with ending:", a) return sanitize(a) end,`
Inital commit 2020-05-16 01:10:11 +02:00			`chunk = V"line"^0 * V"plainline" * V"ending"`
			`}`

Fix the problem that caused the last crash. In certain cases, the imageboard parser will have exponential time complexity. While this patch fixes that particular problem, the correct solution is to implement a timeout for particularly gnarly parts of the code. 2020-12-29 21:19:05 +01:00			`--A chunk of text that the parser chokes on:`
			`local s = [=[`
			`Minor update to the search function, also added a search bar to the front page.`

			`Characters in '''bold''' are literal characters, things in ''<angle brackets and italics>'' are substitutions.`


			`The search utility searches for stories on the site. At it's most simple, it searches stories based on tags, but it can also filter stories based on the fields: '''title''', '''author''', '''date''', and '''hits'''. In general, the syntax for search is {'''+-'''} ''<field>'' ''<operator>'' ''<value>''`

			`The first '''+''' or '''-''' specifies weather to include or exclude results based on this search, the ''<field>'' specifies what field to search for (or search based on tag if this is missing), and ''<operator>'' specifies how to search.`

			For title and author, the only allowed operator is '''='''. This operator will search for ''<value>'' appearing anywhere in the field, case insensitive. For '''hits''' and '''time''', the allowed operators are '''>''','''<''','''>=''', '''<=''','''=''', which searches for greater than, less than, greater than or equal to, less than or equal to, and strictly equal to respectively. '''tag''' does not need a ''<field>'' or ''<operator>'', and only allows exact matches. As a quirk of this system, it is impossible to search for the tags "author", "title", "hits" or "date".

			`Examples:`
			`[code]`
			`+author=admin -meta`
			`[/code]`
			`Will return all stories by the users "admin" and "b'''admin'''ton_enthusiast" that do not include the "meta" tag.`

			`[code]`
			`+hits>20 -date>=1609459201`
			`[/code]`
			`Will return all stories with more than 20 hits that were posted before January 1, 2021 (unix timestamp 1609459201).`

			`While the date field is a little hard to use for humans, it may be useful for robots.`
			`]=]`
			`--print(table.concat({grammar:match(s .. "\n")}," "))`
Minor bugfix 2020-05-17 19:17:15 +02:00			`return function(text)`
			`return table.concat({grammar:match(text .. "\n")}," ")`
Move sql files into their own directory Moved all the of the sql out of the init file and into it's own directory. 2020-05-17 18:05:00 +02:00			`end`