Psst.. new poll here.
Psst.. new forums here.
Microsoft is blocking us again (TY IP Reputation!) so just use oauth login instead. :)
Paste
Pasted by namor ( 18 years ago )
/*
*
* Copyright (C) 2008 Roman Naumann
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* See COPYING.TXT for details.
*
*/
package flahsbuster;
import com.Ostermiller.util.CSVParser;
import com.Ostermiller.util.CSVPrinter;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.URL;
/*
* @Author Roman Naumann
*
*/
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class GismuImageDownloader {
final URL urlToWikisource;
final Pattern wikiImagePattern = Pattern.compile("Image:[^|&]*|[[[a-z]{5}]]");
final Pattern gismuPattern_1 = Pattern.compile("|[[[a-z]{5}]]");
final Pattern gismuPattern_0 = Pattern.compile("[a-z]{5}");
final Pattern imagePattern = Pattern.compile("Image:[^|&]*");
//final Pattern imageDownloadUrlPattern_1 = Pattern.compile("<span style="color:blue;"><ii url="<a href="http://.*">");
//final Pattern imageDownloadUrlPattern_0 = Pattern.compile("http://[^">]*");
final Pattern imageDownloadUrlPattern = Pattern.compile("http://upload.wikimedia.org/wikipedia/[^"]*");
public static void main(String args[]) throws Exception {
new GismuImageDownloader();
}
public static Iterable<MatchResult> getMatchResults(Pattern p, String s) {
List<MatchResult> results = new ArrayList<MatchResult>();
for (Matcher m = p.matcher(s); m.find();) {
results.add(m.toMatchResult());
}
return results;
}
public static String getOneStringResult(Pattern p, String s) {
List<MatchResult> results = new ArrayList<MatchResult>();
for (Matcher m = p.matcher(s); m.find();) {
results.add(m.toMatchResult());
}
assert (results.size() <= 1);
if(results.size()>0)
return results.get(0).group();
else
return null;
}
public GismuImageDownloader() throws Exception {
//Step one, get the image list
urlToWikisource = new URL("http://jbo.wikipedia.org/w/index.php?title=pixra_liste_loi_gismu&action=edit");
String wikiSource = new Scanner(urlToWikisource.openStream()).useDelimiter("").next();
//System.out.println( wikiSource );
Iterable<MatchResult> imageWikiLines = getMatchResults(wikiImagePattern, wikiSource);
List<String> gismuList = new ArrayList();
//extract the gismu names
for (MatchResult i : imageWikiLines) {
String gismu_spam = getOneStringResult(gismuPattern_1, i.group());
String gismu = getOneStringResult(gismuPattern_0, gismu_spam);
gismuList.add(gismu);
System.out.println(gismu);
}
List<String> markupList = new ArrayList();
//extract the image markups
for (MatchResult i : imageWikiLines) {
String imageMarkup = getOneStringResult(imagePattern, i.group());
markupList.add(imageMarkup);
System.out.println(imageMarkup);
}
//Step two: download the image url html-xml spam
List<String> fileNameList = new ArrayList<String>();
for(int n = 0;n<markupList.size();n++){
String imageMarkup = markupList.get(n);
URL urlToImageDownloadSpam = new URL("http://commons.wikimedia.org" +
"/w/api.php?action=query&titles="
+imageMarkup.replaceAll(" ", "%20")
+"&prop=imageinfo&iiprop=url&&iiurlwidth=500");
String imageDownloadSpam = new Scanner(urlToImageDownloadSpam
.openStream()).useDelimiter("").next();
String url = getOneStringResult(imageDownloadUrlPattern, imageDownloadSpam);
if(url==null){
gismuList.remove(n);
markupList.remove(n);
n--;
continue;
}
System.out.println("URL: "+url);
//step three: finally download the images
FileDownload.downloadIfNotAlreadyThere(url);
fileNameList.add(FileDownload.getFileNameToAddr(url));
}
assert(markupList.size()==gismuList.size());
System.out.println("updating the csvImageList");
//step four: update the csvImageList!
CSVPrinter cp = new CSVPrinter(new FileOutputStream("csvGismuImage"));
for(int n=0;n<gismuList.size();n++){
String[] arr = {gismuList.get(n),fileNameList.get(n)};
cp.writeln(arr);
}
cp.close();
}
}
Revise this Paste