#------------------------------------------------------------------------------
# TODO:
# - Fix HTML parsing, no regexen! See wpEditToken+wpEdittime. Unfortunately,
#   REXML is slow.
#
# NOTE(review): This file had been collapsed onto a handful of physical lines
# and two spans of code had been eaten by an HTML-stripping pass. The tail of
# get_what_links_here, all of get_token, the head of delete, the tail of
# get_category_articles and the head of get_categories were reconstructed from
# the surviving fragments and from their call sites. Each of them carries its
# own NOTE(review); verify them against an intact copy of the file.

module MediaWikiBot

  require 'cgi'
  require 'erb'
  require 'http-access2'

  # Screen-scraping client for a MediaWiki site. All requests go through one
  # HTTPAccess2::Client, which keeps its cookies in a cookie file so that a
  # login() carries over to later requests.
  class WikiBot
    include ERB::Util # for url_encode()

    # How many times edit() fetches the edit form while waiting for one that
    # contains the wpEdittime field, before it gives up.
    MAX_EDIT_FORM_FETCHES = 10

    # wiki        - base URL of the wiki as a String. It is concatenated
    #               directly with "index.php?title=...", so it presumably has
    #               to end in "/" (confirm against callers).
    # cookie_file - path handed to the HTTP client's cookie store.
    def initialize(wiki, cookie_file = "cookie.dat")
      @wiki = wiki
      @client = HTTPAccess2::Client.new()
      @client.set_cookie_store(cookie_file)
    end

    #--------------------------------------------------------------------------
    # FIXME: This should really be in HTTPAccess2::Client
    #
    # POSTs post_vars (a Hash of field name => value) to url as an
    # application/x-www-form-urlencoded body and returns the response content.
    def post_form(url, post_vars)
      body = post_vars.map do |name, value|
        url_encode(name) + "=" + url_encode(value)
      end.join("&")
      result = @client.post(url, body,
                            [["Content-Type", "application/x-www-form-urlencoded"]])
      result.content
    end

    #--------------------------------------------------------------------------
    # HTTP authentication
    #
    # Registers HTTP basic-auth credentials for the wiki's base URL.
    def set_basic_auth(user_id, passwd)
      @client.set_basic_auth(@wiki, user_id, passwd)
    end

    #--------------------------------------------------------------------------
    # MediaWiki stuff: Could be useful for other wiki bots
    #
    # URL builders. Each returns a String; titles are URL-encoded.

    # URL of the page itself.
    def url_page(title)
      @wiki + "index.php?title=" + url_encode(title)
    end

    # URL of the page with redirect following switched off.
    def url_no_redirect(title)
      url_page(title) + "&redirect=no"
    end

    # URL of the page's raw wikitext.
    def url_raw(title)
      url_page(title) + "&action=raw"
    end

    # URL of Special:Whatlinkshere for the page.
    def url_what_links_here(title)
      @wiki + "index.php?title=Special:Whatlinkshere&target=" + url_encode(title)
    end

    # URL of the page's delete form.
    def url_delete(title)
      url_page(title) + "&action=delete"
    end

    # URL the login form is submitted to.
    def url_submitlogin
      @wiki + "index.php?title=Special:Userlogin&action=submitlogin"
    end

    # URL of the page's protect form.
    def url_protect(title)
      url_page(title) + "&action=protect"
    end

    # URL of the page's edit form.
    def url_edit(title)
      url_page(title) + "&action=edit"
    end

    # URL the edit form is submitted to.
    def url_edit_submit(title)
      url_page(title) + "&action=submit"
    end

    # Submits the login form with the given user name and password and
    # returns the response content. Session cookies end up in the client's
    # cookie store.
    def login(wiki_name, wiki_password)
      post_form(url_submitlogin,
                { "wpName" => wiki_name,
                  "wpPassword" => wiki_password,
                  "wpLoginattempt" => "" })
    end

    # Truthy (the match offset) when some line of the page's raw text starts
    # with "#REDIRECT" followed by ":" or " "; nil otherwise.
    def is_redirect?(title)
      get_raw(title) =~ /^#REDIRECT[: ]/
    end

    # True when get_what_links_here finds no page linking to title.
    def is_not_linked?(title)
      get_what_links_here(title).empty?
    end

    # Returns the raw wikitext of the page.
    def get_raw(title)
      @client.get_content(url_raw(title))
    end

    # Returns every title="..." attribute value found on Special:Allpages,
    # HTML-unescaped, in page order. Only that single page is fetched.
    def get_allpages
      # FIXME: There seems to be no MediaWiki API? Let's scrape it up from
      # HTML for now.
      allpages = []
      # Fetched through url_page like every other request, instead of the
      # bare @wiki + "Special:Allpages" path used previously.
      allpages_page = @client.get_content(url_page("Special:Allpages"))
      allpages_page.scan(/title="(.*?)"/) do |m|
        allpages.push(CGI.unescapeHTML(m[0]))
      end
      allpages
    end

    # Returns the target inside the first "#REDIRECT [[...]]" line of the
    # page's raw text, or nil when there is no such line (this used to fail
    # with NoMethodError).
    def get_redirect(title)
      get_raw(title)[/^#REDIRECT[: ]\[\[(.*)\]\]/, 1]
    end

    # Returns the titles listed on Special:Whatlinkshere for the page.
    #
    # NOTE(review): everything from the scan pattern onwards was lost from the
    # damaged source and is reconstructed after get_allpages; confirm the
    # pattern against the wiki's actual HTML.
    def get_what_links_here(title)
      # FIXME: There seems to be no MediaWiki API? Let's scrape it up from
      # HTML for now.
      what_links_here = []
      what_links_here_page = @client.get_content(url_what_links_here(title))
      what_links_here_page.scan(/<li><a href=".*?" title="(.*?)"/) do |m|
        what_links_here.push(CGI.unescapeHTML(m[0]))
      end
      what_links_here
    end

    # Extracts the wpEditToken form value from an HTML page.
    #
    # NOTE(review): the definition of this method was lost from the damaged
    # source; only its call sites (edit, delete, protect) survive. The pattern
    # mirrors the wpEdittime pattern used in edit -- confirm.
    def get_token(page)
      page.scan(/value="(.*?)" name="wpEditToken" /)[0][0]
    end

    # Deletes the page, giving reason as the deletion reason.
    #
    # NOTE(review): the method header and the start of its first statement
    # were lost from the damaged source. The surviving fragment
    # ('reason, "wpConfirm" => "1",}))') suggests the token was taken from the
    # response to a first, token-less POST of the delete form; confirm.
    def delete(title, reason)
      token = get_token(post_form(url_delete(title),
                                  { "wpReason" => reason,
                                    "wpConfirm" => "1" }))
      post_form(url_delete(title),
                { "wpReason" => reason,
                  "wpConfirm" => "1",
                  "wpEditToken" => token })
    end

    # Replaces every match of replace_what with replace_with in the page
    # replace_where and saves the result with reason as the edit summary.
    # replace_what is compiled as a regular expression (it is NOT escaped),
    # after each space in it has been turned into "[ _]".
    def replace(replace_where, replace_what, replace_with, reason)
      # " " could be "_"
      # FIXME: Shouldn't be done here
      replace_what = replace_what.gsub(/ /, "[ _]")
      $stderr.print("Replacing /", replace_what, "/ with '", replace_with,
                    "' in '", replace_where, "'.\n")
      before = get_raw(replace_where)
      after = before.gsub(Regexp.new(replace_what), replace_with)
      edit(replace_where, after, reason)
    end

    # Saves body as the new text of the page, with summary as edit summary.
    #
    # The edit form is fetched to obtain the wpEdittime and wpEditToken
    # values. The form is re-fetched while it lacks wpEdittime, but at most
    # MAX_EDIT_FORM_FETCHES times (this used to loop without bound).
    #
    # Raises RuntimeError when no fetched form contains wpEdittime.
    def edit(title, body, summary)
      $stderr.print("Submitting '", title, "'.\n")
      token_page = nil
      edittime = nil
      MAX_EDIT_FORM_FETCHES.times do
        token_page = @client.get_content(url_edit(title))
        edittime = /value="(.*?)" name="wpEdittime" /.match(token_page)
        break if edittime
      end
      unless edittime
        raise "No wpEdittime field in the edit form of '#{title}' " \
              "after #{MAX_EDIT_FORM_FETCHES} fetches"
      end
      token = get_token(token_page)
      post_form(url_edit_submit(title),
                { "wpTextbox1" => body,
                  "wpSummary" => summary,
                  "wpEditToken" => token,
                  "wpEdittime" => edittime[1],
                  "wpSave" => "save" })
    end

    # Same as get_what_links_here.
    def get_what_uses_template(title)
      get_what_links_here(title)
    end

    # Parses the first "{{template...}}" call in the page's raw text and
    # returns a Hash of field name => value.
    #
    # - template is inserted into a regular expression unescaped.
    # - "." does not match newlines here, so the call has to sit on a single
    #   line; when no call is found this fails with NoMethodError, as before.
    # - "|" inside square brackets does not separate fields.
    # - A field is split at its FIRST "=" only, so values may contain "="
    #   (they used to be cut off at the second "=").
    # - A field without "=", or with nothing after it, maps to nil.
    # - Empty fields are skipped (the empty field before the first "|" used
    #   to produce a nil => nil entry).
    def parse_template(title, template)
      template_re = Regexp.new('\{\{' + template + '(.*?)\}\}')
      fields = {}
      fields_string = get_raw(title).scan(template_re)[0][0]
      # Terminate the last field so the loop below flushes it.
      fields_string += "|" unless fields_string.match(/\|$/)
      inlink = 0 # current nesting depth of square brackets
      field = ""
      fields_string.split(//).each do |c|
        if c == "|" && inlink == 0
          unless field.empty?
            key, value = field.split("=", 2)
            value = nil if value && value.empty?
            fields[key] = value
          end
          field = ""
        else
          inlink += 1 if c == "["
          inlink -= 1 if c == "]"
          field += c
        end
      end
      fields
    end

    # Returns the titles of the list entries on the category page.
    #
    # NOTE(review): everything from the scan pattern onwards was lost from the
    # damaged source and is reconstructed after get_allpages; confirm the
    # pattern against the wiki's actual HTML.
    def get_category_articles(category)
      # FIXME: There seems to be no MediaWiki API? Let's scrape it up from
      # HTML for now.
      articles = []
      # Fetched through url_page like every other request, instead of the
      # bare @wiki + url_encode(category) path used previously.
      category_page = @client.get_content(url_page(category))
      category_page.scan(/<li><a href=".*?" title="(.*?)"/) do |m|
        articles.push(CGI.unescapeHTML(m[0]))
      end
      articles
    end

    # Returns the title="..." values found in the page's category box,
    # without the first one, or [] when there is nothing to return.
    #
    # NOTE(review): the head of this method (up to and including the opening
    # of the <div> pattern) was lost from the damaged source. The div id and
    # the page being fetched are reconstructed; the first title is presumably
    # the link to the category overview page -- confirm.
    def get_categories(title)
      categories = []
      page = @client.get_content(url_page(title))
      catlinks = page.scan(/<div id="catlinks">(.*?)<\/div>/)
      return categories if catlinks.empty?
      categories_html = catlinks[0][0]
      categories_html.scan(/title="(.*?)"/) do |m|
        categories.push(CGI.unescapeHTML(m[0]))
      end
      # [][1..-1] is nil, hence the fallback.
      categories[1..-1] || []
    end

    # False for redirect pages; otherwise whether get_categories(title)
    # contains category.
    def is_in_category?(title, category)
      return false if is_redirect?(title)
      get_categories(title).member?(category)
    end

    # Truthy (the match offset) when the rendered page contains
    # "action=unprotect"; nil otherwise.
    def is_protected?(title)
      @client.get_content(url_page(title)) =~ /action=unprotect/
    end

    # Protects the page, giving reason as the protection reason. The edit
    # token is read from the protect form.
    def protect(title, reason)
      token = get_token(@client.get_content(url_protect(title)))
      post_form(url_protect(title),
                { "wpReasonProtect" => reason,
                  "wpConfirmProtectB" => "confirm",
                  "wpEditToken" => token })
    end

  end # class WikiBot

end # module MediaWikiBot