#!/usr/bin/php
<?php
/******************************************************************************
 * Watches a court web server's media directory listing for newly posted files
 * and prints a report of every file not seen on a previous run. When wget and
 * pdftotext are available, each new PDF is downloaded and scanned for a list
 * of case-related keywords; PDFs with a keyword hit are left in the download
 * directory, the others are deleted again.
 *
 * The list of already-seen files is kept between runs as one serialized array
 * in $court_dump_file.
 *
 * NOTE(review): the opening PHP tag and most of the original header comment
 * were missing from the copy of the file this was rewritten from (only the
 * closing delimiter of the comment survived). Restore the original
 * author/licence text if it is available.
 *****************************************************************************/

// ---------------------------------------------------------------------------
// Configuration
// ---------------------------------------------------------------------------
$court_base_href = 'http://www.utd.uscourts.gov';
$court_dump_url  = $court_base_href . '/reports/media/';
$court_dump_file = '/tmp/utdcourt.txt';
$pdftotext       = '/usr/bin/pdftotext';
$wget            = '/usr/bin/wget';
$download_dir    = '/tmp';

// Keywords are compared against lower-cased PDF text, so keep them lower-case.
$pdf_keywords = array('shaughessey','chris kao','sco v ibm','cravath',
                      'international business machines','the sco group','caldera',
                      'david r. marriott','david marriott','brent o. hatch','363-6363',
                      'boies, schiller','boies schiller','hatch james','david boies',
                      'mark j. heise','darl mcbride','kevin mcbride','kevin p mcbride',
                      '2:03cv00294','2:03-cv-00294');

// Maps each key of the serialized state array to the matching field of a
// per-file record. The order here is the order the state array is written in.
$stat_fields = array(
    'file_dates' => 'date',
    'file_times' => 'time',
    'file_sizes' => 'size',
    'file_urls'  => 'url',
    'file_names' => 'name',
    'file_exts'  => 'ext',
);

// ---------------------------------------------------------------------------
// Decide whether PDFs can be fetched and scanned at all
// ---------------------------------------------------------------------------
$parse_pdfs = false;
if (is_executable($pdftotext) && is_executable($wget)) {
    if (is_dir($download_dir) && is_writable($download_dir)) {
        $parse_pdfs = true;
    } else {
        echo "Warning: will not attempt to parse pdfs - $download_dir is ";
        echo " not writable or does not exist\n";
    }
}

// ---------------------------------------------------------------------------
// Load the state saved by the previous run
// ---------------------------------------------------------------------------
// 'a+' is used so that a missing state file is created instead of failing.
$fd = fopen($court_dump_file, 'a+');
if (!$fd) {
    echo "Cannot open file for writing: $court_dump_file\n";
    exit(1);
}
// Append mode may leave the read position at end-of-file; rewinding makes the
// read start at the beginning regardless.
rewind($fd);
$raw_state = fgets($fd);
fclose($fd);

// NOTE(review): unserialize() is applied to a file under /tmp, which other
// local users may be able to replace. Consider moving the state file to a
// private directory or switching to a data-only format.
$old_files_ar = ($raw_state === false) ? false : unserialize($raw_state);

// ---------------------------------------------------------------------------
// Fetch the directory listing and cut out the part holding the file entries
// ---------------------------------------------------------------------------
$html_ar = file($court_dump_url);
if (!$html_ar) {
    echo "Cannot retrieve list from $court_dump_url\n";
    exit(1);
}

// Only the last line of the response is examined.
$text = $html_ar[count($html_ar) - 1];

// NOTE(review): in the copy of the file this was rewritten from, the three
// delimiter literals below contain nothing but newline characters. file()
// returns one line per element, so the last line can never contain two
// consecutive newlines and the first lookup would always fail. The literals
// were presumably HTML tags that were stripped from the source at some point;
// confirm against the live listing and restore them. They are reproduced
// unchanged here.

// Drop everything up to and including the start marker.
$marker = "\n\n";
$pos = strpos(strtolower($text), $marker);
if ($pos === false) {
    echo "Cannot locate marker: $marker\n";
    exit(1);
}
$text = substr($text, $pos + strlen($marker));

// Drop everything from the end marker onwards.
$marker = "\n";
$pos = strpos(strtolower($text), $marker);
if ($pos === false) {
    echo "Cannot locate marker: $marker\n";
    exit(1);
}
$text = substr($text, 0, $pos);

$text_ar = explode("\n", $text);

// ---------------------------------------------------------------------------
// Parse the listing into one record per file
// ---------------------------------------------------------------------------
// Each entry is split as: first space-delimited token = date, second = time,
// third = size, and the first double-quoted field of the remainder = the
// server-relative link to the file.
$listed_files = array();
foreach ($text_ar as $line) {
    $line = trim($line);

    $cut = (int) strpos($line, ' ') + 1;
    $date_str = trim(substr($line, 0, $cut));
    $line = trim(substr($line, $cut));

    $cut = (int) strpos($line, ' ');
    $time_str = trim(substr($line, 0, $cut));
    $line = trim(substr($line, $cut));

    $cut = (int) strpos($line, ' ') + 1;
    $size_str = trim(substr($line, 0, $cut));
    $line = trim(substr($line, $cut));

    // Entries without a quoted link (blank or malformed lines) carry no file;
    // skip them instead of recording a bogus entry built from the base URL.
    $quoted = explode('"', $line);
    if (!isset($quoted[1]) || trim($quoted[1]) === '') {
        continue;
    }

    $url_str   = $court_base_href . trim($quoted[1]);
    $file_name = trim(basename($url_str));

    $listed_files[] = array(
        'date' => $date_str,
        'time' => $time_str,
        'size' => $size_str,
        'url'  => $url_str,
        'name' => $file_name,
        // Text after the last dot, so "a.b.pdf" is recognised as a PDF.
        'ext'  => pathinfo($file_name, PATHINFO_EXTENSION),
    );
}

// ---------------------------------------------------------------------------
// Validate the previous state
// ---------------------------------------------------------------------------
// Start from empty lists; they stay empty when there is no usable old state.
$old_stats = array();
foreach ($stat_fields as $state_key => $field) {
    $old_stats[$state_key] = array();
}

if (is_array($old_files_ar)) {
    // Every list must be present, be an array, and have the same length as
    // the others (they are parallel arrays indexed by file).
    foreach ($stat_fields as $state_key => $field) {
        if (!isset($old_files_ar[$state_key])
            || !is_array($old_files_ar[$state_key])
            || count($old_files_ar[$state_key]) !== count($old_files_ar['file_dates'])) {
            echo "Error reading old file stats\n";
            exit(1);
        }
        $old_stats[$state_key] = $old_files_ar[$state_key];
    }
}

// ---------------------------------------------------------------------------
// Work out which listed files have not been seen before
// ---------------------------------------------------------------------------
$new_files = array();
foreach ($listed_files as $file) {
    if (!in_array($file['name'], $old_stats['file_names'], true)) {
        $new_files[] = $file;
    }
}

// ---------------------------------------------------------------------------
// Report the new files, scanning PDFs where possible, and save the state
// ---------------------------------------------------------------------------
$new = count($new_files);
if ($new > 0) {
    $msg = "\n$new new files at $court_dump_url:\n";

    foreach ($new_files as $file) {
        $f_date = $file['date'];
        $f_size = $file['size'];
        $f_url  = $file['url'];
        $f_name = $file['name'];
        $f_ext  = $file['ext'];

        // Remember the file so it is not reported again on the next run.
        foreach ($stat_fields as $state_key => $field) {
            $old_stats[$state_key][] = $file[$field];
        }

        $related = "Unknown";
        if (strtolower($f_ext) == "pdf" && $parse_pdfs) {
            echo "Fetching: $f_url\n";
            $out = "$download_dir/$f_name";

            // The URL and file name come from the remote listing, so they are
            // escaped before being handed to the shell.
            $cmd = $wget . ' -t 5 -q ' . escapeshellarg($f_url)
                 . ' -O ' . escapeshellarg($out);
            $junk = array();
            exec($cmd, $junk, $rc);

            if ($rc !== 0 || !is_file($out)) {
                echo "Error: failed to fetch $f_url\n";
            } else {
                echo "Scanning: $out\n";
                $cmd = $pdftotext . ' -nopgbrk -q ' . escapeshellarg($out) . ' -';

                // exec() appends to an existing array, so start from an empty
                // one; otherwise text of earlier PDFs would be scanned again.
                $output = array();
                exec($cmd, $output, $rc);

                if ($rc !== 0 || !is_array($output)) {
                    echo "Error: failed on $cmd\n";
                } else {
                    $hit = false;
                    foreach ($output as $text_line) {
                        $text_line = strtolower($text_line);
                        foreach ($pdf_keywords as $keyword) {
                            // Compare against false explicitly: a match at
                            // offset 0 is still a match.
                            if (strpos($text_line, $keyword) !== false) {
                                $hit = true;
                                $related = "**YES** keyword: $keyword";
                                break 2; // first hit is enough
                            }
                        }
                    }
                    if ($hit === false) {
                        // Best effort: an unrelated PDF is not kept around.
                        @unlink($out);
                    }
                }
            }
        }

        $msg .= "\nFile URL: $f_url\n";
        $msg .= " File Name: $f_name\n";
        $msg .= " File Date: $f_date\n";
        $msg .= " File Size: $f_size\n";
        $msg .= " SCO Related: $related\n";
    }

    echo $msg;

    $fd = fopen($court_dump_file, 'w');
    if (!$fd) {
        echo "\nError: Cannot re-open file for writing: $court_dump_file\n";
        exit(1);
    }
    if (!fputs($fd, serialize($old_stats))) {
        echo "\nWARNING: Could not re-create $court_dump_file\n";
    }
    // Close the handle whether or not the write succeeded.
    fclose($fd);
}