<?PHP
/***************
* Sakai help scraper
* Written by Matt Clare, Brock University
* Not written by someone with a computer science degree
* There may be memory size issues - you may want to run it from the command line!
* i.e. php -n -f help_scrape.php -- the -n is for "No php.ini file will be used"
* The categories and template used in the script can be reviewed and re-used at http://kumu.brocku.ca/sakai/
*
* Flow: fetch the Sakai help table of contents, follow every article link,
* convert each article's HTML to MediaWiki markup, and print one MediaWiki
* import XML document to stdout (optionally dumping per-article files too).
/***************/
header("content-type: text/xml");
//error_reporting(2047);
//header('Content-Type: text/plain; charset=utf-8');
$help_base_url = 'https://lms.brocku.ca/portal/help/'; //Root URL of the Sakai help portal
$toc ='TOCDisplay/main'; //Table of Contents path, relative to $help_base_url
$output_dir = './data/'; //Server directory where per-article .html/.txt dumps are written (see write_file)
$username = 'SakaiImport'; //MediaWiki username credited with creating each imported entry
$blank = array('<p> </p>',"Â",'<','>','Back to top'); //Fragments blanked out of article text ("Â" looks like a mis-encoded non-breaking-space artefact - confirm)
$remove_dup_titles = true; //Strip the repeated article title from the top of each body (see remove_title)
$queens_english = true; //Convert American spellings to Canadian/British (color -> colour, etc.)
$link_to_articles = true; //Rewrite known terms into wiki links (see $link_to_article)
$write_files = false; //Dry-run guard: no files are written unless this is true
$exclude_filename = "exclude.txt"; //List of articles to exclude from export/import - our list is at http://ctlet.brocku.ca/~mclare/sakai/help_scrape/exclude.txt
//Categories to append to an article if the word is found in the article text
$categories = array('Announcements','Assignments','Blogger','Chat Room','Drop Box','Forms','Forums','Glossary','Gradebook','Groups','Permissions','Tests and Quizzes','Syllabus','Dropbox','News','Messages','Podcasts','Polls','Presentation','Profile','Resources','Schedule','Sign-up','Site Stats','Web Content','Wiki','Worksite Setup','Firefox','Internet Explorer','Netscape','Mac OS','Windows','My Workspace','Portfolio','Student View','see');
//Variations: if any word in the value array is found, the article is filed under the key's category
$categories_variations = array('Gradebook' => array('Grade book','Grades'),'Assignments' => array('Assignment'),'Blogger' => array('Blog'),'Chat Room' => array('Chat'),'News' => array('RSS','Feed','Podcasts'),'Schedule' => array('Calendar'),'Sign-up' => array('signup','sign up'),'Site Stats' => array('Stats'),'Roles' =>array('Role','Roles'),'Text Editor' =>array('FCKeditor'),'TAs'=>array(' TAs ','teaching assistant'),'Tests and Quizzes' => array('Tests','Quizzes','Questions'),'Non-Brock' => array('Other Official Participants'),'Forums' => array('Forum'));
//Phrases (values) to replace with wiki-markup links (keys) - consumed by link_to_articles().
//NOTE(review): the final element 'Permissions and roles' has no key, so it gets integer
//key 0 and link_to_articles() would replace that phrase with "0" - probably a stray
//leftover from editing the previous pair; confirm and remove.
$link_to_article = array("'''''Note:''''' " =>'Note: ','[[XTHML]]'=>'XHTML','[[HTML]]' =>'HTML','[[PPT]]' =>'ppt','[[MP3]]' =>'MP3','[[mp4]]' =>'MP4','[[RSS]]'=>'RSS',' [[URL]] '=>' URL ',' [[Sakai]] '=>' Sakai ',' [[LMS]] ' =>' CLE ',' [[WebDAV]] ' =>' WebDAV ','[[ZIP]|.zip] ' =>'.zip ','[[Permissions and roles]]' =>'Permissions and roles: Overview','Permissions and roles');
//Wiki script appended to every article: templates marking it as auto-imported.
//All auto-imported documents get a category "Auto Imported <current date>" plus the
//"Help Page" and "Sakai Help Source" templates; the "Auto Imported and Not Reviewed"
//template should be removed once a human has reviewed the article - that category is the to-do list.
$append_to_wiki = "{{Template:Help Page}}\n{{Template:Sakai Help Source}}\n{{Template:Auto Imported and Not Reviewed}}\n[[Category:Auto Imported ".date('Y m d')."]]\n";
$regexp = "<a\s[^>]*href=(\"??)([^\" >]*?)\\1[^>]*>(.*)<\/a>"; //Anchor-matching regexp: match 2 captures the href, match 3 the link text
$articles = array(); //title => absolute URL, filled from the table of contents below
//Load the list of article titles to exclude from the export (one title per line).
//Guard against a missing/unreadable file: the original fopen()/filesize()/fread()
//sequence is fatal on PHP 8 when exclude.txt is absent (fread() on a false handle);
//instead degrade gracefully to "exclude nothing".
if (is_readable($exclude_filename)) {
	$contents = file_get_contents($exclude_filename);
	$exclude = explode("\n",$contents);
} else {
	$contents = '';
	$exclude = array();
}
function queens_english ($contents) {
	// Map of American spellings to their Canadian/British equivalents.
	$spellings = array('Honor'=>'Honour','spoilt'=>'spoiled','color'=>'colour','favorite' =>'favourite');
	// str_ireplace() with arrays applies each search/replace pair in order,
	// case-insensitively - same effect as replacing one pair at a time.
	return str_ireplace(array_keys($spellings), array_values($spellings), $contents);
}
function link_to_articles ($contents) {
	// $link_to_article maps wiki markup (key) to the plain phrase (value)
	// that should be rewritten into that markup.
	global $link_to_article;
	$result = $contents;
	foreach ($link_to_article as $markup => $phrase) {
		// Case-insensitive replacement of the phrase with its wiki link.
		$result = str_ireplace($phrase, $markup, $result);
	}
	return $result;
}
function find_categories ($contents) {
	// Build a block of "[[Category:X]]\n" lines for every category keyword
	// (or keyword variation) that appears anywhere in $contents.
	// Returns '' when nothing matches.
	global $categories,$categories_variations; // (dropped unused $append_to_wiki)
	$append = '';
	// Direct keyword matches. BUG FIX: the old test was "stripos(...) > 1",
	// which silently skipped keywords found at offsets 0 and 1; compare
	// against false instead.
	foreach ($categories as $category) {
		if (stripos($contents, $category) !== false && strpos($append, "[[Category:$category]]") === false) {
			$append .= "[[Category:$category]]\n"; //Add category, so long as it's not already there
		}
	}
	// Variations: e.g. finding "Grades" files the article under "Gradebook".
	foreach ($categories_variations as $category => $variants) {
		foreach ($variants as $variant) {
			if (stripos($contents, $variant) !== false && strpos($append, "[[Category:$category]]") === false) {
				$append .= "[[Category:$category]]\n"; //Add category, so long as it's not already there
			}
		}
	}
	return $append;
}
function remove_title($title,$contents,$length = 200) { //Looks at the first $length characters of $contents and tries to remove the title
// Sakai help pages repeat the page title at the top of the body; this builds
// a set of near-variations of the title (with connectors like ": " or " a "
// injected between its words) and blanks them out of the head of the text,
// leaving everything past $length untouched.
$common_variations = explode(' ',$title);
$i=1;
$c = array();
// $c[0] is the searched head, $c[1] the untouched tail.
$c[0] = substr($contents, 0,$length );
$c[1] = substr($contents, $length ,strlen($contents));
$variations = array(1 => "$title");
// NOTE(review): $v_value is never used, so this outer loop regenerates the
// same variation set once per word of the title; the duplicate entries are
// harmless to str_ireplace() but redundant - confirm before simplifying.
foreach ($common_variations as $v_value) {
// Connector strings injected after a word of the title, to match phrasings
// such as "Adding an announcement" for a title like "Add announcement".
$replacers = array(': ',' a ',' an ','ing ','ing an ');
for($k=0;$k < count($replacers);$k++){
$i++;
$j=0;
$variations[$i] = '';
// Rebuild the title word by word, appending the current connector after
// the $j-th word and a plain space after every other word.
foreach ($common_variations as $v_value2) {
$j++;
$variations[$i] .= $v_value2;
for ($l=1;$l<count($common_variations);$l++){
if ($l == $j) $variations[$i] .= $replacers[$k];
else $variations[$i] .= ' ';
}
}
}
}
// Blank every generated variation out of the head, case-insensitively.
$c[0] = str_ireplace($variations,"",$c[0]);
$c[0] = str_replace("====",'',$c[0]); // Heading conversion tends to leave stray "====" markers behind
return $c[0].$c[1];
}
function get_url_contents($url) {
	// Fetch the raw contents of $url (a local path also works; http(s) URLs
	// require allow_url_fopen). Returns the body converted to UTF-8, or ''
	// on failure so one unreachable article does not abort the whole run.
	$contents = file_get_contents($url);
	if ($contents === false) {
		// BUG FIX: the original fopen()/stream_get_contents() had no error
		// check and was fatal on PHP 8 when the fetch failed.
		trigger_error("Unable to read from: $url", E_USER_WARNING);
		return '';
	}
	// Explicitly work in UTF-8 to reduce errors downstream.
	// mb_convert_encoding() from Latin-1 is the documented replacement for
	// the deprecated utf8_encode() and behaves identically.
	return mb_convert_encoding($contents, 'UTF-8', 'ISO-8859-1');
}
function write_file ($filename, $content) {
	// Write $content to $output_dir/$filename, but only when the global
	// $write_files dry-run guard is enabled. Dies on an unwritable target.
	global $output_dir,$write_files;
	if (!$write_files) {
		return;
	}
	// Escape quotes/backslashes as the original import did, and flatten path
	// separators so an article title cannot escape $output_dir.
	$filename = addslashes($filename);
	$filename = str_replace('/','_',$filename);
	// NOTE: the deprecated utf8_encode() on the path was dropped; for the
	// ASCII $output_dir used here the result is identical.
	$f = fopen($output_dir.$filename, "wb");
	if ($f === false) die("Unable to open for writing: ".$filename);
	// BUG FIX: fwrite() returns the byte count, which is 0 (falsy) for empty
	// content, so the original "!fputs(...)" died on empty articles; compare
	// against false instead.
	if (fwrite($f, $content) === false) die("Unable to write to: ".$filename);
	fclose($f);
}
//Read in from Sakai: fetch the help table of contents and collect every
//article link into $articles (link text => absolute URL).
$contents = get_url_contents($help_base_url.$toc);
// $matches[2] holds the href targets, $matches[3] the link text (used as the
// article title), per the capture groups in $regexp.
if(preg_match_all("/$regexp/siU", $contents, $matches)) {
foreach($matches[3] as $match_key => $match_value){
if (!in_array($match_value,$exclude)){
// Keep only non-empty hrefs that contain no '#': the leading space shifts
// every offset up by one, so a '#' anywhere (even position 0) makes
// strpos() return >= 1 and the "< 1" test excludes the link.
if (strpos(' '.$matches[2][$match_key],"#") < 1 && strlen($matches[2][$match_key]) > 0) $articles[$match_value] = $help_base_url.$matches[2][$match_key];
}
}
}
//Open the MediaWiki XML export (version 0.3) envelope; <page> entries are
//appended to $xml inside the article loop below.
$xml = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.3/ http://www.mediawiki.org/xml/export-0.3.xsd" version="0.3" xml:lang="en">
<siteinfo>
<sitename>Sakai Import</sitename>
<base>http://kumu.brocku.ca/sakai/Main_Page</base>
<generator>PHP '.phpversion().'</generator>
<case>first-letter</case>';
//The namespace table below is intentionally disabled (MediaWiki accepts the
//import without it). It is based on Brock University's Sakai/Isaak wiki
//categories http://kumu.brocku.ca/sakai/Special:Categories and can be
//re-enabled if the target wiki needs explicit namespace declarations.
/*'
<namespaces>
<namespace key="-2">Media</namespace>
<namespace key="-1">Special</namespace>
<namespace key="0" />
<namespace key="1">Talk</namespace>
<namespace key="2">User</namespace>
<namespace key="3">User talk</namespace>
<namespace key="4">Sakai at Brock</namespace>
<namespace key="5">Discussion</namespace>
<namespace key="6">Image</namespace>
<namespace key="7">Image talk</namespace>
<namespace key="8">MediaWiki</namespace>
<namespace key="9">MediaWiki talk</namespace>
<namespace key="10">Template</namespace>
<namespace key="11">Template talk</namespace>
<namespace key="12">Help</namespace>
<namespace key="13">Help talk</namespace>
<namespace key="14">Category</namespace>
<namespace key="15">Category talk</namespace>
</namespaces>';
*/
$xml .= '</siteinfo>';
//Fetch each help article, convert its HTML to wiki markup, optionally dump it
//to disk, and append a <page> entry to the MediaWiki import XML.
$i = 0;
foreach($articles as $key => $value) {
	$i++;
	$contents = get_url_contents($value);
	write_file($key.'.html',$contents); // raw HTML snapshot (only when $write_files)
	// Normalise the HTML with Tidy when the extension is available; the tidy
	// object is used as a string by the str_* calls below.
	if (function_exists('tidy_repair_string')) {
		$config = array('indent' => 0, // the old array listed 'indent' twice; the later 0 won, kept here
			'indent-spaces' => 0,
			'hide-comments' => TRUE,
			'newline' => 0,
			'tab-size' => 0,
			'output-html' => TRUE);
		$tidy = tidy_parse_string($contents, $config);
		$tidy->cleanRepair();
		$contents = $tidy;
	}
	/** Translate heading tags to wiki markup: <h1> -> =, <h2> -> ==, ... <h5> -> ===== **/
	$eql = '';
	// BUG FIX: this counter was $i in the original and clobbered the outer
	// article counter; renamed to $level (the outer $i is otherwise unused,
	// so output is unchanged).
	for ($level=1;$level<6;$level++) {
		$eql .= '=';
		$contents = str_ireplace(array("<h$level>\n","\n</h$level>"),$eql,$contents);
		$contents = str_ireplace(array("<h$level>","</h$level>"),$eql,$contents);
	}
	$contents = str_replace(' '," ",$contents); // NOTE(review): both arguments read as plain spaces; the first was presumably a non-breaking space before an encoding mishap - confirm
	$contents = str_ireplace('<li>',"*",$contents); // list items become wiki bullets
	$contents = html_entity_decode($contents); // decode entities before stripping tags
	$contents = strip_tags($contents); // remove remaining HTML tags (mainly p tags)
	$contents = str_replace('&','and',$contents); // bare ampersands are not valid XML text
	$contents = str_replace($blank,'',$contents); // drop boilerplate fragments (see $blank)
	/** Collapse runs of blank lines, working line by line **/
	$contents_array= explode("\n",$contents);
	$contents = '';
	$blank_watch = array(); // indexes of the current run of consecutive short/blank lines
	for($line=0; $line < count($contents_array);$line++) {
		if ($line > 2) {
			if (strlen($contents_array[$line]) <= 2) { // lines this short count as blank
				if (in_array($line-1,$blank_watch)) array_push($blank_watch,$line); // extend the current blank run
				else {
					// A new blank run starts here; if the previous run had more
					// than one line, delete those lines to squeeze the gap.
					if (count($blank_watch) > 1) {
						foreach ($blank_watch as $blank_line) {
							unset($contents_array[$blank_line]);
						}
					}
					$blank_watch = array($line);
				}
			}
		}
		// NOTE(review): this else belongs to "if ($line > 2)", so the first three
		// lines are appended here WITHOUT newlines and then appended again by the
		// loop below - the head text comes out duplicated; confirm intent.
		else $contents .= trim($contents_array[$line]);
	}
	foreach ($contents_array as $out_line){
		// Skip lines that still contain a stray "*/" comment terminator.
		if (strpos($out_line,'*/') < 1) $contents .= trim($out_line)."\n";
	}
	if ($remove_dup_titles) $contents = remove_title($key,$contents); //Remove repeated titles and some blank lines after them
	if ($queens_english) $contents = queens_english($contents); //Canadian/British spellings
	if ($link_to_articles) $contents = link_to_articles ($contents); //wiki-link known terms
	$contents = $contents.$append_to_wiki.find_categories($contents); //templates + categories footer
	write_file($key.'.txt',$contents); //wiki-markup dump (only when $write_files)
	// NOTE(review): $key is embedded unescaped; a title containing '&' or '<'
	// would break the XML - titles scraped so far have been plain text.
	$xml .='<page>
<title>'.$key.'</title>
<id>4</id>
<revision>
<id>1</id>
<timestamp>'.date("Y-m-d\TG:i:s\Z").'</timestamp>
<contributor>
<username>'.$username.'</username>
<id>2</id>
</contributor>
<comment>Import from Sakai help pages '.date("F j, Y, g:i a").'</comment>
<text xml:space="preserve">'.sprintf("%s",$contents).'</text>
</revision>
</page>'."\n";
	// BUG FIX: the original unset($contents) ran BEFORE the $xml .= block above,
	// so every exported <text> element was empty. Release the buffer afterwards.
	unset($contents);
}
$xml .= '</mediawiki>';
// Emit the assembled MediaWiki import document.
echo $xml;
// BUG FIX: the closing "?>" tag is deliberately omitted. The original file had
// a stray "|" after "?>" which PHP echoed verbatim after the XML document,
// corrupting the export; ending without "?>" makes trailing output impossible.