-- Module:WikitextParser
--
-- From Wikipedia, the free encyclopedia
-- Module:WikitextParser is a general-purpose wikitext parser
-- Documentation and master version: https://en.wikipedia.org/wiki/Module:WikitextParser
-- Authors: User:Sophivorus, User:Certes, User:Aidan9382, et al.
-- License: CC-BY-SA-4.0
-- Module table: the public functions below are attached to it, and it is what the module returns
local WikitextParser = {}

-- Helper function to escape a string for use in Lua patterns
-- Every magic character ( ^ $ ( ) . [ ] * + - ? % ) is prefixed with % so that the result matches the input literally.
-- @param str Required. String to escape.
-- @return The escaped string (a single value)
local function escapeString( str )
	-- The outer parentheses drop gsub's second result (the number of substitutions),
	-- which would otherwise leak into the argument list of a caller that passes this call as its last argument
	return ( str:gsub( '[%^%$%(%)%.%[%]%*%+%-%?%%]', '%%%0' ) )
end

-- Get the lead section from the given wikitext
-- The lead section is everything that comes before the first section title.
-- @param wikitext Required. Wikitext to parse.
-- @return Wikitext of the lead section, trimmed. May be empty if the lead section is empty.
function WikitextParser.getLead( wikitext )
	-- The extra leading newline lets a section title on the very first line match the pattern too
	local lead = ( '\n' .. wikitext ):gsub( '\n==.*', '' )
	local trimmed = mw.text.trim( lead )
	return trimmed
end

-- Get the sections from the given wikitext
-- This method doesn't get the lead section, use getLead for that
-- The content of a section stops at the next section title of any level, so subsections get their own entry.
-- If a title appears more than once, the entry holds the content of its first instance.
-- @param wikitext Required. Wikitext to parse.
-- @return Map from section title (trimmed) to section content (trimmed)
function WikitextParser.getSections( wikitext )
	local sections = {}
	-- Leading newline: lets a title on the first line match
	-- Trailing '\n==': sentinel that terminates the content of the last section
	wikitext = '\n' .. wikitext .. '\n=='
	for rawTitle in wikitext:gmatch( '\n==+ *([^=]+) *==+' ) do
		-- [^=]+ is greedy, so rawTitle still carries any whitespace that preceded the closing equals signs.
		-- It's used untouched to locate the title again, but trimmed before being used as key,
		-- otherwise "== History ==" would end up stored under "History " (with a trailing space)
		local section = wikitext:match( '\n==+ *' .. escapeString( rawTitle ) .. ' *==+(.-)\n==' )
		if section then
			sections[ mw.text.trim( rawTitle ) ] = mw.text.trim( section )
		end
	end
	return sections
end

-- Get a section from the given wikitext (including any subsections)
-- When the title occurs several times, only the section of the first occurrence is returned
-- @param wikitext Required. Wikitext to parse.
-- @param title Required. Title of the section
-- @return Wikitext of the section, or nil if it isn't found. May be empty if the section is empty or contains only subsections.
function WikitextParser.getSection( wikitext, title )
	local escapedTitle = escapeString( mw.text.trim( title ) )
	local padded = '\n' .. wikitext .. '\n'

	-- level receives the run of equals signs that opens the title, content everything after the title line
	local level, content = padded:match( '\n(==+) *' .. escapedTitle .. ' *==.-\n(.*)' )
	if not content then
		return
	end

	-- Cut from the first later title whose level is the same or higher (that is, with as many or fewer equals signs)
	local laterSection = '\n==' .. string.rep( '=?', #level - 2 ) .. '[^=].*'
	content = content:gsub( laterSection, '' )
	content = mw.text.trim( content )
	return content
end

-- Get the content of a <section> tag from the given wikitext.
-- We can't use getTags because both opening and closing <section> tags are self-closing tags.
-- Both markers accept optional spaces around the equals sign and optional quotes around the name.
-- @param wikitext Required. Wikitext to parse.
-- @param name Required. Name of the <section> tag
-- @return Content of the <section> tag, or nil if it isn't found. May be empty if the section tag is empty.
function WikitextParser.getSectionTag( wikitext, name )
	name = escapeString( mw.text.trim( name ) )

	-- Tail shared by the begin and end markers, so that both tolerate the same spacing
	-- (the end marker used to reject spaces before the equals sign)
	local attribute = ' *= *["\']? *' .. name .. ' *["\']? */>'

	local content = wikitext:match( '< *section +begin' .. attribute .. '(.-)< *section +end' .. attribute )
	if content then
		return mw.text.trim( content )
	end
end

-- Get the lists from the given wikitext.
-- A list is a run of consecutive lines that start with * or #.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of lists.
function WikitextParser.getLists( wikitext )
	local lists = {}
	-- Leading newline: lets a list on the first line match
	-- Trailing newlines: guarantee that a list on the last line is followed by a non-list line
	wikitext = '\n' .. wikitext .. '\n\n'
	local position = 1
	while true do
		local _, matchEnd, list = wikitext:find( '\n([*#].-)\n[^*#]', position )
		if not matchEnd then
			break
		end
		lists[ #lists + 1 ] = list
		-- The match ends on the first character of the line after the list. When that line is blank,
		-- that character is the very newline that introduces the next list, so the search resumes one
		-- character earlier instead of right after the match (which made lists separated by a single
		-- blank line go unnoticed).
		position = matchEnd - 1
	end
	return lists
end

-- Get the paragraphs from the given wikitext.
-- Lists, lines made only of a [[link]] (files, categories), tables, block templates and section titles are discarded.
-- A removed line that sat between two lines of text splits them into separate paragraphs.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of paragraphs.
function WikitextParser.getParagraphs( wikitext )
	local paragraphs = {}

	-- Remove non-paragraphs
	wikitext = '\n' .. wikitext .. '\n'
	wikitext = wikitext:gsub( '\n[*#][^\n]*', '' ) -- remove lists
	-- The patterns below need a newline on both sides of the element, and gsub can't reuse the newline
	-- shared by two consecutive elements, so the newlines are doubled first. Without this, every second
	-- line of a run of files or categories was left behind and returned as a paragraph.
	wikitext = wikitext:gsub( '\n%[%b[]%]\n', '\n%0\n' ) -- add spacing between files and categories
	wikitext = wikitext:gsub( '\n%[%b[]%]\n', '\n' ) -- remove files and categories
	wikitext = wikitext:gsub( '\n%b{} *\n', '\n%0\n' ) -- add spacing between tables and block templates
	wikitext = wikitext:gsub( '\n%b{} *\n', '\n' ) -- remove tables and block templates
	wikitext = wikitext:gsub( '\n==+[^=]+==+ *\n', '\n' ) -- remove section titles
	wikitext = mw.text.trim( wikitext )

	-- What remains is split on blank lines; chunks that are only whitespace are dropped
	for paragraph in mw.text.gsplit( wikitext, '\n\n+' ) do
		if mw.text.trim( paragraph ) ~= '' then
			table.insert( paragraphs, paragraph )
		end
	end
	return paragraphs
end

-- Get the templates from the given wikitext.
-- Only outermost templates are returned (templates nested inside another template are part of its wikitext).
-- Parser functions like {{#if:...}} are skipped.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of templates.
function WikitextParser.getTemplates( wikitext )
	local templates = {}
	for template in wikitext:gmatch( '{%b{}}' ) do
		-- The check must look at the start of each match, not at the start of the whole wikitext
		if template:sub( 1, 3 ) ~= '{{#' then -- skip parser functions like #if
			table.insert( templates, template )
		end
	end
	return templates
end

-- Get the requested template from the given wikitext.
-- If the template appears more than once, only the first instance will be returned
-- Names are compared after trimming and after uppercasing the first letter with the content language.
-- @param wikitext Required. Wikitext to parse.
-- @param name Name of the template to get
-- @return Wikitext of the template, or nil if it wasn't found
function WikitextParser.getTemplate( wikitext, name )
	if name == nil then
		return nil
	end
	local lang = mw.language.getContentLanguage()
	local wanted = lang:ucfirst( mw.text.trim( name ) )
	-- ipairs walks the sequence in order, which is what "first instance" relies on
	for _, template in ipairs( WikitextParser.getTemplates( wikitext ) ) do
		-- The capture is nil for things like {{|x}} and keeps trailing spaces for {{Foo |x}}
		local templateName = template:match( '^{{ *([^}|\n]+)' )
		if templateName and lang:ucfirst( mw.text.trim( templateName ) ) == wanted then
			return template
		end
	end
end

-- Get the parameters from the given template.
-- Unnamed parameters are keyed by their position (1, 2, 3...) among the unnamed parameters.
-- Names and values are trimmed.
-- @param template Required. Template wikitext to parse.
-- @return Map from parameter name to parameter value
function WikitextParser.getParameters( template )
	local parameters = {}
	local params = template:match( '{{[^|}]-|(.*)}}' )
	if not params then
		return parameters -- the template has no parameters
	end

	-- Temporarily replace pipes and equals signs in subtemplates and links to avoid chaos
	local placeholders = { ['|'] = '@@:@@', ['='] = '@@_@@' }
	local function protect( text )
		-- Parentheses keep only the string; a function replacement is used as is, so % needs no escaping
		return ( text:gsub( '[|=]', placeholders ) )
	end
	params = params:gsub( '{%b{}}', protect )
	-- The brackets must be escaped: '[%b[]]' is read as a character class followed by a literal ']',
	-- which left links unprotected
	params = params:gsub( '%[%b[]%]', protect )

	local count = 0
	for param in mw.text.gsplit( params, '|' ) do
		local parts = mw.text.split( param, '=' )
		local name = mw.text.trim( parts[1] )
		local value
		if #parts == 1 then
			-- No equals sign, so it's an unnamed parameter
			count = count + 1
			value = name
			name = count
		else
			-- Everything after the first equals sign belongs to the value
			value = mw.text.trim( table.concat( parts, '=', 2 ) )
		end
		-- Put back the characters that were hidden above
		value = value:gsub( '@@_@@', '=' )
		value = value:gsub( '@@:@@', '|' )
		parameters[ name ] = value
	end
	return parameters
end

-- Get the tags from the given wikitext.
-- Each opening tag found yields one entry, so tags nested inside other tags are also returned on their own.
-- An unclosed tag other than <div> or <span> (this includes HTML comments) yields just its opening tag.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of tags.
function WikitextParser.getTags( wikitext )
	local tags = {}
	local tag, tagName, tagEnd
	for tagStart, tagOpen in wikitext:gmatch( '()(<[^/].->)' ) do
		tagName = tagOpen:match( '< ?(.-)[ >]' )

		-- Pattern for the closing tag, capturing the position right after it.
		-- The name is escaped because it may contain magic characters (e.g. '!--' for HTML comments)
		local closingTag = '</ ?' .. escapeString( tagName ) .. ' ?>()'

		-- If we're in a self-closing tag, like <ref name="foo" />, <references/>, <br/>, <br>, <hr>, etc.
		if tagOpen:match( '<.-/>' ) or tagName == 'br' or tagName == 'hr' then
			tag = tagOpen

		-- If we're in a tag that may contain others like it, like <div> or <span>
		elseif tagName == 'div' or tagName == 'span' then
			local position = tagStart + #tagOpen - 1
			local depth = 1
			-- Walk forward one closing tag at a time, tracking whether another opening tag comes before it
			while depth > 0 do
				tagEnd = wikitext:match( closingTag, position )
				if tagEnd then
					tagEnd = tagEnd - 1
				else
					break -- unclosed tag
				end
				position = wikitext:match( '()< ?' .. tagName .. '[ >]', position + 1 )
				if not position then
					position = tagEnd + 1
				end
				if position > tagEnd then
					depth = depth - 1
				else
					depth = depth + 1
				end
			end
			-- tagEnd is nil if the loop was left because of an unclosed tag;
			-- string.sub then treats the end as omitted and returns the rest of the wikitext
			tag = wikitext:sub( tagStart, tagEnd )

		-- Else we're in tag that shouldn't contain others like it, like <math> or <strong>
		else
			tagEnd = wikitext:match( closingTag, tagStart )
			if tagEnd then
				tag = wikitext:sub( tagStart, tagEnd - 1 )
			else
				-- No closing tag: fall back to the opening tag instead of doing arithmetic on nil
				tag = tagOpen
			end
		end
		table.insert( tags, tag )
	end
	return tags
end

-- Get the <gallery> tags from the given wikitext.
-- Filters the result of getTags, keeping the tags whose name is exactly "gallery".
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of gallery tags.
function WikitextParser.getGalleries( wikitext )
	local found = {}
	for _, candidate in pairs( WikitextParser.getTags( wikitext ) ) do
		-- The name is whatever sits between the opening < and the first space or >
		if candidate:match( '< ?(.-)[ >]' ) == 'gallery' then
			found[ #found + 1 ] = candidate
		end
	end
	return found
end

-- Get the <ref> tags from the given wikitext.
-- Filters the result of getTags, keeping the tags whose name is exactly "ref".
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of ref tags.
function WikitextParser.getReferences( wikitext )
	local found = {}
	for _, candidate in pairs( WikitextParser.getTags( wikitext ) ) do
		-- The name is whatever sits between the opening < and the first space or >
		if candidate:match( '< ?(.-)[ >]' ) == 'ref' then
			found[ #found + 1 ] = candidate
		end
	end
	return found
end

-- Get the tables from the given wikitext.
-- A table is a balanced pair of braces that begins with {| at the start of a line.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of tables.
function WikitextParser.getTables( wikitext )
	local found = {}
	-- The extra leading newline lets a table that opens on the very first line match too
	for candidate in ( '\n' .. wikitext ):gmatch( '\n%b{}' ) do
		-- %b{} matches any balanced pair of braces at the start of a line, so keep only those opening with {|
		if candidate:sub( 1, 3 ) == '\n{|' then
			-- Trimming also removes the newline matched in front of the table
			found[ #found + 1 ] = mw.text.trim( candidate )
		end
	end
	return found
end

-- Get the id from the given table wikitext
-- Only the first line of the table (the one that starts with {|) is inspected.
-- @param t Required. Wikitext of the table to parse.
-- @return Id of the table or nil if not found
function WikitextParser.getTableId( t )
	-- Captures what follows "id =" on the first line, optionally wrapped in quotes
	local idPattern = '^{|[^\n]-id *= *["\']?([^"\'\n]+)["\']?[^\n]*\n'
	local id = string.match( t, idPattern )
	return id
end

-- Get a table by id from the given wikitext
-- The first table whose id equals the given one is returned.
-- @param wikitext Required. Wikitext to parse.
-- @param id Required. Id of the table
-- @return Wikitext of the table or nil if not found
function WikitextParser.getTableById( wikitext, id )
	for _, candidate in ipairs( WikitextParser.getTables( wikitext ) ) do
		local candidateId = WikitextParser.getTableId( candidate )
		if candidateId == id then
			return candidate
		end
	end
end

-- Get the data from the given table wikitext
-- Header cells (!) and data cells (|) are treated alike. Cell attributes are not separated from the cell content.
-- @param tableWikitext Required. Wikitext of the table to parse.
-- @return Table data, as a sequence of rows, each row being a sequence of trimmed cell wikitext
-- @todo Test and make more robust
function WikitextParser.getTableData( tableWikitext )
	local tableData = {}
	tableWikitext = mw.text.trim( tableWikitext )
	tableWikitext = string.gsub( tableWikitext, '^{|.-\n', '' ) -- remove the header
	tableWikitext = string.gsub( tableWikitext, '\n|}$', '' ) -- remove the footer
	tableWikitext = string.gsub( tableWikitext, '^|%+.-\n', '' ) -- remove any caption
	tableWikitext = string.gsub( tableWikitext, '|%-.-\n', '|-\n' ) -- remove any row attributes
	tableWikitext = string.gsub( tableWikitext, '^|%-\n', '' ) -- remove any leading empty row
	tableWikitext = string.gsub( tableWikitext, '\n|%-$', '' ) -- remove any trailing empty row
	for rowWikitext in mw.text.gsplit( tableWikitext, '|-', true ) do
		local rowData = {}
		-- Normalize every cell separator to a pipe at the start of a line
		rowWikitext = string.gsub( rowWikitext, '||', '\n|' )
		rowWikitext = string.gsub( rowWikitext, '!!', '\n|' )
		rowWikitext = string.gsub( rowWikitext, '\n!', '\n|' )
		rowWikitext = string.gsub( rowWikitext, '^!', '|' )
		-- Drop the separator that opens the first cell. The first row doesn't start with a newline
		-- (the leading row marker was removed above), so a bare leading pipe must be dropped too,
		-- otherwise the first cell of a table without header row keeps its pipe
		rowWikitext = string.gsub( rowWikitext, '^\n?|', '' )
		for cellWikitext in mw.text.gsplit( rowWikitext, '\n|' ) do
			cellWikitext = mw.text.trim( cellWikitext )
			table.insert( rowData, cellWikitext )
		end
		table.insert( tableData, rowData )
	end
	return tableData
end

-- Get the internal links from the given wikitext (includes category and file links).
-- An internal link is a balanced pair of double square brackets, like [[Foo]] or [[Foo|bar]].
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of internal links.
function WikitextParser.getLinks( wikitext )
	local found = {}
	local total = 0
	for link in wikitext:gmatch( '%[%b[]%]' ) do
		total = total + 1
		found[ total ] = link
	end
	return found
end

-- Get the file links from the given wikitext.
-- A link is a file link when the text before its first colon is a namespace name
-- whose canonical name is "File" according to mw.site.namespaces.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of file links.
function WikitextParser.getFiles( wikitext )
	local files = {}
	for _, link in ipairs( WikitextParser.getLinks( wikitext ) ) do
		-- Stop at the first colon and never cross a pipe or a closing bracket, so that colons further on
		-- (captions, link=https://... and the like) don't end up inside the captured namespace
		local namespace = link:match( '^%[%[ *([^:|%]]-) *:' )
		local data = namespace and mw.site.namespaces[ namespace ]
		if data and data.canonicalName == 'File' then
			table.insert( files, link )
		end
	end
	return files
end

-- Get the category links from the given wikitext.
-- A link is a category link when the text before its first colon is a namespace name
-- whose canonical name is "Category" according to mw.site.namespaces.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of category links.
function WikitextParser.getCategories( wikitext )
	local categories = {}
	for _, link in ipairs( WikitextParser.getLinks( wikitext ) ) do
		-- Stop at the first colon and never cross a pipe or a closing bracket, so that colons further on
		-- (in the page name or in the sort key) don't end up inside the captured namespace
		local namespace = link:match( '^%[%[ *([^:|%]]-) *:' )
		local data = namespace and mw.site.namespaces[ namespace ]
		if data and data.canonicalName == 'Category' then
			table.insert( categories, link )
		end
	end
	return categories
end

-- Get the external links from the given wikitext.
-- Only bracketed links that start with //, http:// or https:// are returned.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of external links.
function WikitextParser.getExternalLinks( wikitext )
	local found = {}
	for candidate in wikitext:gmatch( '%b[]' ) do
		-- %b[] yields every balanced pair of square brackets, so the candidates are filtered by how they start
		local isProtocolRelative = candidate:match( '^%[//' ) ~= nil
		local isHttp = candidate:match( '^%[https?://' ) ~= nil
		if isProtocolRelative or isHttp then
			found[ #found + 1 ] = candidate
		end
	end
	return found
end

-- Export the module table
return WikitextParser