User:Tom.Bot/Task6 code
Source
/// <summary>
/// Adds {{Authority control}} to an article's wikitext when the article looks like a biography,
/// does not already carry the template, and its Wikidata item has at least one of the
/// identifier properties listed in ACPropertyList.
/// </summary>
/// <param name="ArticleText">Wikitext of the page being processed.</param>
/// <param name="ArticleTitle">Full page title; used for the blacklist check, the sandbox
/// auto-detect, and the Wikipedia API lookup of the page's Wikidata item.</param>
/// <param name="wikiNamespace">Not used by this method.</param>
/// <param name="Summary">Receives the edit summary on success, or the reason the page was skipped.</param>
/// <param name="Skip">Set to true when the page should not be saved. Forced back to false at the
/// end when LiveDebug or SandboxDebug is on.</param>
/// <returns>The (possibly modified) wikitext. Note that the DEFAULTSORT/stub normalisation is
/// applied before the "no category anchor" check, so the returned text can differ from the input
/// even when Skip is true.</returns>
public string ProcessArticle(string ArticleText, string ArticleTitle, int wikiNamespace, out string Summary, out bool Skip)
{
    // global switches //////////////////////////////////////////////////////////
    bool TomBot = true;
    bool SaveSkipSummaries = false;
    bool SkipIfBlacklisted = true;
    bool ManuallyCheckPagesWithoutAGoodInfobox = false; // usually it's an {{infobox person}} or {{infobox scientist}}
    bool ManuallyPlaceAuthorityAtEndOfPage = false; // aid for pages w/o a {{DEFAULTSORT}} nor cats; manual use only
    bool LiveDebug = false;
    bool SandboxDebug = false; // auto-detect
    Skip = false;

    // global-use vars //////////////////////////////////////////////////////////
    Summary = "";

    // preliminary exceptions/error checking ////////////////////////////////////
    if (ArticleTitle == "User:Tom.Reding/sandbox") SandboxDebug = true;

    // Titles that must never be edited; the single empty entry is a placeholder.
    List<string> BlackList = new List<string>(new string[] {
        ""
    });
    if (!Skip && BlackList.Contains(ArticleTitle))
    {
        if (SkipIfBlacklisted)
        {
            Summary = "Blacklisted article";
            Skip = true;
        }
    }

    // check for appropriate (bio) infoboxes (now done via PetScan for all templates in [[Category:People and person infobox templates]], per BRFA)
    // Both patterns require the template name to be followed by "|" or an HTML comment (lookahead).
    string PeopleTemplates_Regex = @"\{\{\s*(?:[Ii]nfobox[ _]+actor[ _]+voice|[Ii]nfobox[ _]+Actor|[Ii]nfobox[ _]+actor|[Ii]nfobox[ _]+Actress|[Ii]nfobox[ _]+actress|[Ii]nfobox[ _]+adult[ _]+biography|[Ii]nfobox[ _]+adult[ _]+female|[Ii]nfobox[ _]+adult[ _]+male|[Ii]nfobox[ _]+Biography|[Ii]nfobox[ _]+biography|[Ii]nfobox[ _]+bio|[Ii]nfobox[ _]+Celebrity|[Ii]nfobox[ _]+director|[Ii]nfobox[ _]+entertainer|[Ii]nfobox[ _]+Fashion[ _]+Designer|[Ii]nfobox[ _]+fashion[ _]+designer|[Ii]nfobox[ _]+film[ _]+actor|[Ii]nfobox[ _]+film[ _]+director|[Ii]nfobox[ _]+human[ _]+being|[Ii]nfobox[ _]+human|[Ii]nfobox[ _]+Indian[ _]+Businessmen|[Ii]nfobox[ _]+Journalist|[Ii]nfobox[ _]+journalist|[Ii]nfobox[ _]+people|[Ii]nfobox[ _]+performer|[Ii]nfobox[ _]+person/measurements|[Ii]nfobox[ _]+person[ _]+ii|[Ii]nfobox[ _]+person|[Ii]nfobox[ _]+Person|[Ii]nfobox[ _]+photographer|[Ii]nfobox[ _]+Real[ _]+Person|[Ii]nfobox[ _]+trade[ _]+unionist|[Ii]nfobox[ _]+victim|[Pp]ersonbox)(?=\s*(?:\||\<\!\-\-))";
    string ScientistTemplates_Regex = @"\{\{\s*(?:[Ii]nfobox[ _]+Academic|[Ii]nfobox[ _]+chemist|[Ii]nfobox[ _]+historian|[Ii]nfobox[ _]+mathematician|[Ii]nfobox[ _]+Professor|[Ii]nfobox[ _]+scientist|[Ii]nfobox[ _]+Scientist)(?=\s*(?:\||\<\!\-\-))";
    bool Bio1 = Regex.IsMatch(ArticleText, PeopleTemplates_Regex, RegexOptions.IgnoreCase);
    bool Bio2 = Regex.IsMatch(ArticleText, ScientistTemplates_Regex, RegexOptions.IgnoreCase);
    bool NoBioTemplates = (Bio1 == false && Bio2 == false);
    if (!Skip && NoBioTemplates)
    {
        if (ManuallyCheckPagesWithoutAGoodInfobox)
        {
            // OK to proceed (manually)
        }
        else
        {
            Summary += @"No bio templates found. ";
            Skip = true;
        }
    }

    // check for {{Authority control (or one of its aliases) already being present
    if (!Skip)
    {
        string AuthorityAliases_Regex = @"\{\{\s*(?:[Aa]uthoritycontrol|[Aa]uthority[ _]+controll|[Aa]uthority[ _]+control|[Aa]uthority[ _]+Control|[Aa]utorité|[Ee]xternal[ _]+identifiers|[Nn]ormdaten)"; // 0 grps
        bool HasAuthority = Regex.IsMatch(ArticleText, AuthorityAliases_Regex, RegexOptions.IgnoreCase);
        if (HasAuthority)
        {
            Summary += @"{{Authority control}} exists. ";
            Skip = true;
        }
    }

    // get wikibase_item via WP API
    // ex: https://en.wikipedia.org//w/api.php?action=query&format=json&prop=pageprops&titles=Panthera%20leo&redirects=0&formatversion=2&ppprop=wikibase_item
    // The title is percent-encoded as UTF-8 by the framework. The previous hand-written
    // Replace() chain emitted "%96" for an en dash (a Windows-1252 byte, not valid UTF-8) and
    // left "%", "#", quotes and every other non-ASCII character unencoded, so such titles
    // produced a broken "titles=" parameter.
    string ArticleTitle_URL = System.Uri.EscapeDataString(ArticleTitle);
    string URL1 = @"https://en.wikipedia.org//w/api.php?action=query&format=json&prop=pageprops&titles=" +
        ArticleTitle_URL + @"&redirects=0&formatversion=2&ppprop=wikibase_item";
    string HTML1 = "";
    if (!Skip && !SandboxDebug)
    {
        try
        {
            HTML1 = Tools.GetHTML(URL1);
        }
        catch
        {
            // Any fetch failure is reported through Summary; the page is skipped unless live-debugging.
            Summary = "GetHTML1 failed. ArticleTitle_URL = " + ArticleTitle_URL + " . ";
            if (!LiveDebug) Skip = true;
        }
    }

    // html1 error checks ///////////////////////////////////////////////////////
    // Groups[1] is the empty string when the response has no "wikibase_item" entry.
    string QID = Regex.Match(HTML1, @"wikibase_item"":""([^""]+)").Groups[1].Value;
    if (string.IsNullOrEmpty(QID) && !Skip && !SandboxDebug)
    {
        Summary = @"QID retrieval failed. ";
        Skip = true;
    }
    if (!Regex.IsMatch(QID, @"^Q\d+$") && !Skip && !SandboxDebug) // case sensitive, jtbs
    {
        Summary = @"Unexpected QID format. ";
        Skip = true;
    }

    // determine # of WD properties used ////////////////////////////////////////
    List<string> ACPropertyList = new List<string>(new string[] {
        // from Module:Authority control's local conf = { ... } table:
        "P864",
        "P2558",
        "P3293",
        "P1015",
        "P2092",
        "P950",
        "P268",
        "P428",
        "P651",
        "P271",
        "P2456",
        "P227",
        "P902",
        "P213",
        "P347",
        "P1248",
        "P244",
        "P886",
        "P640",
        "P434",
        "P549",
        "P1225",
        "P1223",
        "P1222",
        "P1048",
        "P349",
        "P691",
        "P409",
        "P496",
        "P2750",
        "P1053",
        "P650",
        "P350",
        "P947",
        "P396",
        "P906",
        "P781",
        "P3430",
        "P269",
        "P1362",
        "P245",
        "P1157",
        "P214"
    });

    // get Wikidata
    // ex: https://www.wikidata.org//w/api.php?action=wbgetclaims&format=json&entity=Q184201
    // QID needs no encoding: when it is actually fetched it has already matched ^Q\d+$ above.
    string URL2 = @"https://www.wikidata.org//w/api.php?action=wbgetclaims&format=json&entity=" + QID;
    string HTML2 = "";
    if (!Skip && !SandboxDebug)
    {
        try
        {
            HTML2 = Tools.GetHTML(URL2);
        }
        catch
        {
            // Same best-effort handling as the first request.
            Summary = "GetHTML2 failed. URL2 = " + URL2 + " . ";
            if (!LiveDebug) Skip = true;
        }
    }

    // scrape Wikidata
    // example text surrounding a populated property from
    // https://www.wikidata.org/w/api.php?action=wbgetclaims&entity=Q184201 :
    // "P227": [
    // {
    // "mainsnak": {
    // "snaktype": "value",
    // "property": "P227",
    // "hash": "275a0595679f80411271280f2ee7344a94dfbeb6",
    // "datavalue": {
    // "value": "4776869-1",
    // "type": "string"
    // },
    // "datatype": "external-id"
    // },
    int iProps = 0;
    if (!Skip && !SandboxDebug)
    {
        foreach (string p in ACPropertyList)
        {
            // A property counts only when its snak carries a non-empty string "value";
            // [^\{\}]* keeps the match inside the same JSON object as the "property" key.
            string p_regex = @"""property"":\s*""" + p + @""",[^\{\}]*""datavalue"":\s*\{\s*""value"":\s*""[^""]+""";
            bool Found = Regex.IsMatch(HTML2, p_regex);
            if (Found) iProps++;
        }
        if (iProps == 0)
        {
            Summary = @"0 IDs on Wikidata. ";
            Skip = true;
        }
    }

    // main /////////////////////////////////////////////////////////////////////
    if (!Skip)
    {
        if (SandboxDebug)
        {
            // No network calls were made in sandbox mode; use placeholder values.
            iProps = 1;
            QID = "1";
        }

        // std {{DEFAULTSORT
        string DF_Regex = @"\{\{\s*(?:DEFAULTSORT|[Dd]efaultSort|[Dd]efaultsort|DEFAULT[ _]+SORT|[Dd]efault[ _]+sort|[Ss]ORTIERUNG:Lasorling|SORTIERUNG)(?=[:\|\}])";
        ArticleText = Regex.Replace(ArticleText, DF_Regex, @"{{DEFAULTSORT", RegexOptions.IgnoreCase);

        // Move {{-stub}} tag closer to end of page, otherwise GenFixes adds an extra line before {{Authority control}} that can't be fixed w/o a reparse.
        // Leading "\s*" replaced with "\n" fix cases like "{{reflist}}{{blah-stub}}" on the same line.
        string MoveStubAfterCat_Regex = @"\s*(\{\{[^\{\}]*[ -]stub\s*\}\})\s*(\[\[\s*Category[^\[\]]+\]\])";
        ArticleText = Regex.Replace(ArticleText, MoveStubAfterCat_Regex, "\n" + @"$2" + "\n" + @"$1", RegexOptions.IgnoreCase);

        string AuthorityComplete = @"{{Authority control}}";
        // Group 1 lazily captures everything from the start of the page up to the line break(s)
        // preceding the first {{DEFAULTSORT or [[Category. "^" is not multiline, so at most one
        // match (and therefore one insertion) is possible.
        string AddBeforeCats_Regex = @"(^[\d\D]+?)(?=[\r\n]+[ ]*(?:\{\{DEFAULTSORT|\[\[\s*Category))"; // better results than adding after last cat

        string Plural = (iProps > 1) ? "s" : "";
        string SuccessSummary = @"+{{[[Template:Authority control|Authority control]]}}";
        if (TomBot) SuccessSummary = @"[[Wikipedia:Bots/Requests for approval/Tom.Bot 6|Task 6]]: " + SuccessSummary;
        if (iProps > 0) SuccessSummary += " (" + iProps + @" source" + Plural + @" from Wikidata)";
        SuccessSummary += ", [[WP:GenFixes]] on,";

        bool NoCat = !Regex.IsMatch(ArticleText, AddBeforeCats_Regex, RegexOptions.IgnoreCase);
        if (NoCat)
        {
            if (ManuallyPlaceAuthorityAtEndOfPage)
            {
                ArticleText += "\n" + AuthorityComplete;
                Summary = SuccessSummary + " (uncategorized page) ";
            }
            else
            {
                Summary += @"No cats/DEFAULTSORT to anchor {{Authority control}} to. Batch manually/code later. ";
                Skip = true;
            }
        }
        else
        {
            ArticleText = Regex.Replace(ArticleText, AddBeforeCats_Regex, @"$1" + "\n" + AuthorityComplete, RegexOptions.IgnoreCase);
            Summary = SuccessSummary;
        }
    }

    // exception tracking ///////////////////////////////////////////////////////
    if (Skip && SaveSkipSummaries && !SandboxDebug)
    {
        // One tab-separated "title <TAB> reason" line per skipped page, appended to a log file.
        string Message = ArticleTitle + "\t" + Summary + "\n";
        string File = @"Module output - Add {{Authority control}} (skip summaries).txt";
        string Path = @"F:\"; // desktop
        string FullPath = Path + File;
        const bool APPEND = true;
        Tools.WriteTextFileAbsolutePath(Message, FullPath, APPEND);
    }

    // In either debug mode the page is never skipped, whatever the checks above decided.
    if (LiveDebug || SandboxDebug) Skip = false;
    return ArticleText;
}
Content Disclaimer
Informasi ini disarikan dari Wikipedia dan disajikan kembali untuk tujuan edukasi. Konten tersedia di bawah lisensi CC BY-SA 3.0. Kami tidak bertanggung jawab atas ketidakakuratan data yang bersumber dari kontribusi publik tersebut.
- The information displayed on this website is sourced in part or in whole from Wikipedia and has been adapted for re-presentation here. We strive to provide accurate and relevant information; however:
- There is no guarantee of absolute accuracy. Wikipedia is an open, collaborative project that can be edited by anyone, so information is subject to change.
- It is not intended to constitute professional advice. The content displayed is for informational and educational purposes only. For important decisions (e.g., medical, legal, or financial), please consult a professional.
- Content copyright. Wikipedia is licensed under the Creative Commons Attribution-ShareAlike License (CC BY-SA). This means that content may be reused with appropriate attribution and shared under a similar license.
- Responsible use. Any risk arising from the use of information from this website is entirely the responsibility of the user.