#!/usr/bin/perl
# catnav bot by WikiPedia:User:下一次登录
# Disclaimer: No warranty granted, use at your own risk!
# call requirements
use Getopt::Std;
use LWP::Simple;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
use HTTP::Cookies;
#subroutine
#parameters
# Bot credentials and target wiki. These are declared with "our" rather than
# a file-scope "local": they remain the same package globals that the rest
# of the script reads by name, without the pointless save/restore that
# "local" performs.
our $username  = "xcnbot";           # bot account name; only English names are tested
our $password  = "******";           # bot account password
our $WIKI_PATH = "zh.wikipedia.org"; # host name of the wiki the bot works on
our $WIKI_PAGE;                      # category title currently being requested, as spliced into URLs
### Login to wiki
# Set up connection data
# Request headers passed explicitly to every get()/post() call made
# through $browser.
my @ns_headers = (
    'User-Agent'      => 'Xcnbot1.0 by 下一次登录',
    'Accept'          => 'image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, image/png, */*',
    'Accept-Charset'  => 'iso-8859-1,*,utf-8',
    'Accept-Language' => 'en-US',
);
# One user agent is shared by the whole script. The empty hash reference
# asks LWP for an in-memory cookie jar, so cookies set by the login
# response are sent again with the later requests.
my $browser = LWP::UserAgent->new( cookie_jar => {} );
{# Login
    # Post the credentials to Special:Userlogin. The cookies returned here
    # are kept by $browser's cookie jar for the rest of the run.
    $response = $browser->post(
        "http://".$WIKI_PATH."/w/index.php?title=Special:Userlogin&action=submitlogin",
        @ns_headers,
        Content => [
            wpName         => $username,
            wpPassword     => $password,
            wpRemember     => "1",
            wpLoginAttempt => "Log in",
        ],
    );
    # After logging in, we should be redirected to another page (HTTP 302).
    # Any other status means the login form was rejected or the request
    # failed, so the bot must not go on to edit anonymously.
    if ($response->code != 302) { #cannot login
        print "We weren't able to login.\n\n";
        # Report what the server actually answered; the original closed a
        # DEBUG handle here that was never opened and gave no detail.
        print STDERR "Login request returned: ".$response->status_line."\n";
        exit 1;
    }
    print "Logged in \n";
}
# Trivial variables
# Working state shared by the sections of the main loop. Declared with "our"
# instead of a file-scope "local": the variables stay the package globals
# that the rest of the script refers to by name.
our $content;        # HTTP response text of the category page being checked
our $editcontent;    # wikitext taken from the page's edit form; this is what gets saved
our $neweditcontent; # scratch copy of $editcontent used while looking for catnav templates
our $content1;       # HTTP response text of a category visited while walking up the catnav chain
our $editToken;      # value of the wpEditToken form field
our $editTime;       # value of the wpEdittime form field
our $beforeCatnav;   # not used by the code visible in this file
our @Catnav;         # catnav template calls found in the wikitext, in order of appearance
our $afterCatnav;    # copy of the wikitext as fetched; assigned but not read in the visible code
our $bError;         # set to 1 when a catnav link is missing from the checked page's category links
our $vLine;          # HTML of the rendered catnav line currently being examined
our $nDepth;         # number of category links left to verify in the current catnav line
our $nCatnav;        # count / index of catnav templates on the current page
our @vCat;           # category names taken from the links of the current catnav line
our $bChange;        # set to 1 when $editcontent was modified and has to be saved
# Set parameters
my $vNameU="%E5%BE%8C%E7%87%95%E5%B9%B4%E8%99%9F"; # URL-encoded name of the category being processed (overwritten per page)
# State for the category list scraped from Special:Allpages
my $article_count=0; #number of articles in allpages
my @article_name; #the characters of the article names for log
my @article_unicode; #the unicode article names for connection
my $last_string; #the unicode of the last article in the last run (init="%21")
my $article_line; #one article line in allpage content
my $article_ID; #index into @article_unicode of the page being processed
# Append to the run log. The handle name LOG1 is kept because the rest of the
# script prints to it. Three-argument open keeps the mode separate from the
# file name, and a failure is reported instead of being silently ignored;
# the bot still runs without a log, as it did before.
open(LOG1, '>>', 'log.txt')
    or warn "Cannot open log.txt for appending, continuing without a log: $!\n";
while(1) { #process
#read last_string.txt and start allpages from that article
# Load the resume point written at the end of the previous round. A missing
# or unreadable file leaves it empty, which makes the listing request start
# with an empty "from" value.
$last_string="";
if (open(my $resume_fh, '<', 'last_string.txt')) {
    local $/; # slurp the whole file in one read
    my $saved = <$resume_fh>;
    close($resume_fh);
    $last_string = $saved if defined $saved;
    # The value is spliced straight into a URL, so a trailing newline left
    # by an editor must not be carried along.
    $last_string =~ s/^\s+//;
    $last_string =~ s/\s+$//;
}
if(1) { #process allpages contents and make a list
    # Ask Special:Allpages for namespace 14, starting at the resume point.
    $URL="http://".$WIKI_PATH."/w/index.php?title=Special:Allpages&from=".$last_string."&namespace=14";
    $response=$browser->get($URL, @ns_headers);
    $filename1=$response->as_string;

    # Start every round with an empty list so that entries of a previous
    # round can never be processed again by accident.
    $article_count=0;
    @article_unicode=();
    @article_name=();

    # Cut the response down to the listing: everything after the opening tag
    # of the listing table and before the print footer.
    my $table_open="<table style=\"background: inherit;\" border=\"0\" width=\"100%\">";
    my $table_at=index($filename1, $table_open);
    if ($table_at<0) {
        # Without the table there is nothing to parse. Slicing at a "not
        # found" offset would only produce garbage titles, so stop the main
        # loop instead; last_string.txt is left as it is.
        print "\nNo category listing found at ".$URL." (".$response->status_line."); stopping.\n";
        print LOG1 "\nNo category listing found at ".$URL." (".$response->status_line."); stopping.\n";
        last;
    }
    $filename1=substr($filename1, $table_at+length($table_open));
    my $footer_at=index($filename1, "<div class=\"printfooter\">");
    $filename1=substr($filename1, 0, $footer_at) if $footer_at>=0;

    # Walk the <td>...</td> cells. Only cells that really contain a category
    # link are recorded; the cap of 203 entries per round is the same upper
    # bound the fixed-count loop used before.
    my $max_articles=203;
    while ($article_count<$max_articles && $filename1 =~ m{<td>(.*?)</td>}gs) {
        my $cell=$1;
        # The link target after "/wiki/Category:" is the encoded name used
        # for later requests.
        next unless $cell =~ m{<a href="/wiki/Category:([^"]*)"(.*)}s;
        my ($encoded_name, $after_href)=($1, $2);
        # The title attribute following the href is the readable name.
        my $readable_name=($after_href =~ m{title="([^"]*)"}) ? $1 : "";
        $article_unicode[$article_count]=$encoded_name;
        $article_name[$article_count]=$readable_name;
        $article_count+=1;
    }

    if ($article_count==0) {
        # An empty listing leaves nothing to process and no new resume
        # point to record, so end the main loop.
        print "\nNo categories listed at ".$URL."; stopping.\n";
        print LOG1 "\nNo categories listed at ".$URL."; stopping.\n";
        last;
    }
} #process allpages contents and make a list
$article_ID=0;
while($article_ID<$article_count) { #go through all the pages and process
# Pick the next category of this round.
$vNameU=$article_unicode[$article_ID];
# Connect to root cat
$WIKI_PAGE=$vNameU;
$URL="http://".$WIKI_PATH."/wiki/Category:".$WIKI_PAGE;
$response=$browser->get($URL, @ns_headers);
# A 5xx status (LWP also reports its own connection errors that way) means
# the page was not really delivered; treat it as a page without catnavs.
$content=($response->code>=500) ? "" : $response->as_string;
print "\n";
print $URL;
print LOG1 "\n";
print LOG1 $URL;
if ($response->code>=500) {
    print " [fetch failed: ".$response->status_line."]";
    print LOG1 " [fetch failed: ".$response->status_line."]";
}
# Fetch the edit form of the same page; it carries the wikitext and the
# hidden fields needed to save.
$URL="http://".$WIKI_PATH."/w/index.php?title=Category:".$WIKI_PAGE."&action=edit";
$response=$browser->get($URL, @ns_headers);
my $edit_form=($response->code>=500) ? "" : $response->as_string;
# Get EditToken
($editToken) = ( $edit_form =~ m/value\=\"([0-9a-f\\]*)\" name\=\"wpEditToken\"/ );
($editTime) = ( $edit_form =~ m/value\=\"([0-9a-f]*)\" name\=\"wpEdittime\"/ );
# The wikitext is the body of the wpTextbox1 textarea. Matching the tag
# itself, instead of adding a fixed byte offset to a marker position, keeps
# working when attributes differ and yields an empty text (never a random
# slice of HTML) when the form is missing.
if ($edit_form =~ m{<textarea\b[^>]*\bname="wpTextbox1"[^>]*>(.*?)</textarea>}s) {
    $editcontent=$1;
}
else {
    $editcontent="";
}
# With no wikitext there is nothing that could be edited, so do not let the
# later sections examine the rendered page either.
$content="" unless length $editcontent;
$afterCatnav=$editcontent;
print "\nC ";
print LOG1 "\nC ";
$bChange=0;
#organize edit content for catnav
# Helper: find every catnav template call in a piece of wikitext.
#   $wikitext - text to scan (may be undef)
# Returns a two-element list:
#   - array reference with the template calls in order of appearance, each
#     running from "{{Catnav" up to and including the first "}}" after it,
#     with a lower-case "{{catnav" normalised to "{{Catnav"
#   - the normalised text that follows the last complete template
# A template without a closing "}}" is not reported.
sub _extract_catnav_templates {
    my ($wikitext) = @_;
    return ([], "") unless defined $wikitext;
    $wikitext =~ s/\{\{catnav/{{Catnav/g;
    my @templates;
    my $tail_at = 0;
    while ($wikitext =~ m/(\{\{Catnav.*?\}\})/gs) {
        push @templates, $1;
        # Remember where this template ended; scanning resumes there, not
        # one character after the template's start.
        $tail_at = pos($wikitext);
    }
    return (\@templates, substr($wikitext, $tail_at));
}
#find all {{catnav
{
    my ($found, $tail) = _extract_catnav_templates($editcontent);
    # Replace the whole list so templates of a previously processed page
    # cannot linger in higher slots.
    @Catnav = @$found;
    $nCatnav = scalar @Catnav;
    $neweditcontent = $tail;
}
print $nCatnav;
print "cn(s) ";
print LOG1 $nCatnav;
print LOG1 "cn(s) ";
# process catnav
# Helper: list the category names linked from one rendered catnav line.
#   $line - HTML of the catnav line (may be undef)
# Links placed before a "..." marker are ignored. Returns the names as they
# appear in the href attributes, in order of appearance.
sub _catnav_link_targets {
    my ($line) = @_;
    return () unless defined $line;
    my $ellipsis_at = index($line, "...");
    $line = substr($line, $ellipsis_at + 3) if $ellipsis_at >= 0;
    my @targets;
    while ($line =~ m{<a href="/wiki/Category:([^"]*)" title="Category:}g) {
        push @targets, $1;
    }
    return @targets;
}
# Each rendered catnav is checked link by link, starting with the last link
# against the current page's own category links and then moving up through
# the linked categories. The n-th rendered catnav is paired with the n-th
# template collected in @Catnav.
$nCatnav=0;
my $catnav_marker="title=\"catnav\" style=";
while ((my $marker_at=index($content, $catnav_marker))>=0)
{ # process catnav
    #get a line: everything from the marker up to the closing </div>
    $content=substr($content, $marker_at+length($catnav_marker));
    my $line_end=index($content, "</div>");
    if ($line_end<0) {
        $vLine=$content;
        $content="";
    }
    else {
        $vLine=substr($content, 0, $line_end);
        $content=substr($content, $line_end+length("</div>"));
    }
    #get all links
    @vCat=_catnav_link_targets($vLine);
    $nDepth=scalar @vCat;
    #check parent cats
    $bError=0;
    my $fetch_failed=0;
    my $Page=$vNameU;
    while($nDepth>0 && $bError==0 && $fetch_failed==0)
    {
        $nDepth=$nDepth-1;
        $WIKI_PAGE=$Page;
        $URL="http://".$WIKI_PATH."/wiki/Category:".$WIKI_PAGE;
        $response=$browser->get($URL, @ns_headers);
        if ($response->code>=500) {
            # The page could not be read, so the chain cannot be judged.
            # Deleting a template because of a transient failure would be
            # destructive; leave this catnav alone.
            $fetch_failed=1;
            print "Skip ";
            print LOG1 "Skip ";
            last;
        }
        $content1=$response->as_string;
        # Narrow the page down to its category-links box; a page without
        # that box has no category links to match against.
        my $links_at=index($content1, "<div id=\"catlinks\"><p class='catlinks'><a href=\"/wiki/Special:Categories\" title=");
        $content1=($links_at>=0) ? substr($content1, $links_at) : "";
        my $links_end=index($content1, "</span></p></div>");
        $content1=substr($content1, 0, $links_end) if $links_end>=0;
        # The category named by the catnav must be linked from the page just
        # fetched; it becomes the page to fetch in the next step.
        $Page=$vCat[$nDepth];
        my $Page1="Category:".$vCat[$nDepth]."\" title";
        if(index($content1, $Page1)<0)
        {
            $bError=1;
            print "Err ";
            print LOG1 "Err ";
        }
    }
    #delete catnav
    if($bError>0)
    { #delete catnav
        my $template=$Catnav[$nCatnav];
        # Only touch the wikitext when the template text is really found in
        # it. The first letter is matched in either case because @Catnav
        # holds the normalised spelling, and a line break directly after
        # the template is removed with it.
        if (defined $template && $template =~ m/^\{\{Catnav(.*)\z/s) {
            my $template_rest=quotemeta($1);
            if ($editcontent =~ s/\{\{[Cc]atnav$template_rest\n?//) {
                $bChange=1;
            }
        }
    } #delete catnav
    $nCatnav+=1;
} # process catnav
#update
# Helper: turn text copied out of the edit form's HTML back into plain
# wikitext by decoding the named entities quot, lt, gt, amp and nbsp.
#   $text - HTML-escaped text (may be undef, which is returned unchanged)
# Everything is decoded in ONE pass, so an escaped ampersand followed by
# "lt;" comes out as the literal characters of an entity and is not decoded
# a second time. The entity names are kept in a table so this source file
# itself contains no literal entity that a careless HTML round trip could
# destroy.
sub _unescape_form_text {
    my ($text) = @_;
    return $text unless defined $text;
    my %char_for = (
        quot => "\"",
        lt   => "<",
        gt   => ">",
        amp  => "&",
        nbsp => " ",
    );
    $text =~ s/&(quot|lt|gt|amp|nbsp);/$char_for{$1}/g;
    return $text;
}
if($bChange>0)
{
    # Decode the text that is actually submitted. Previously the decoding was
    # applied to the scratch copy while the still-escaped $editcontent was
    # posted.
    $editcontent=_unescape_form_text($editcontent);
    $WIKI_PAGE=$vNameU;
    $URL="http://".$WIKI_PATH."/w/index.php?title=Category:".$WIKI_PAGE."&action=edit";
    if (!defined $editToken || !defined $editTime) {
        # Without the hidden form fields the save cannot succeed, so do not
        # send it and do not claim a change.
        print "Not saved (no edit token).";
        print LOG1 "Not saved (no edit token).";
    }
    else {#Update
        $response=$browser ->
            post($URL, @ns_headers, Content_Type=>'form-data',Content=>
                [ wpTextbox1 => $editcontent,
                  wpSummary => "Testing: Wrong catnav deleted.",
                  wpSave => "Save page",
                  wpSection => "",
                  wpEdittime => $editTime,
                  wpEditToken => $editToken,
                  wpMinoredit => "1",
                ]);
        # As with the login request, success is taken to be a redirect
        # (HTTP 302); any other status is logged as a failed save instead of
        # being reported as a change.
        if ($response->code==302) {
            print "Changed.";
            print LOG1 "Changed.";
        }
        else {
            print "Save failed (".$response->status_line.").";
            print LOG1 "Save failed (".$response->status_line.").";
        }
    }
}
$article_ID+=1;
} #while ID<count
if(1) { #record last string.txt
    # The last category of this round is where the next round starts.
    my $resume_from=($article_count>0) ? $article_unicode[$article_count-1] : undef;
    if (!defined $resume_from || $resume_from eq $last_string) {
        # Nothing was listed, or the listing did not get past the point this
        # round started from. Another round would only request and process
        # the same page again, so end the main loop here.
        print "\nNo further categories after the saved resume point; stopping.\n";
        print LOG1 "\nNo further categories after the saved resume point; stopping.\n";
        last;
    }
    # If the resume point cannot be stored, every later round would redo
    # this one; stop with the reason instead of looping silently.
    open(my $resume_out, '>', 'last_string.txt')
        or die "Cannot write last_string.txt: $!\n";
    print $resume_out $resume_from;
    close($resume_out)
        or die "Cannot finish writing last_string.txt: $!\n";
}
} #while whole