Welcome, guest! Login / Register - Why register?
Psst.. new poll here.
Psst.. new forums here.
Microsoft is blocking us again (TY IP Reputation!) so just use oauth login instead. :)

Paste

Pasted by namor ( 18 years ago )
/*
 * 
 * Copyright (C) 2008 Roman Naumann
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * 
 * See COPYING.TXT for details.
 * 
 */
package flahsbuster;

import com.Ostermiller.util.CSVParser;
import com.Ostermiller.util.CSVPrinter;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.URL;

/*
 * @Author Roman Naumann
 *
 */
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class GismuImageDownloader {

    final URL urlToWikisource;
    final Pattern wikiImagePattern = Pattern.compile("Image:[^|&]*|[[[a-z]{5}]]");
    final Pattern gismuPattern_1 = Pattern.compile("|[[[a-z]{5}]]");
    final Pattern gismuPattern_0 = Pattern.compile("[a-z]{5}");
    final Pattern imagePattern = Pattern.compile("Image:[^|&]*");
    //final Pattern imageDownloadUrlPattern_1 = Pattern.compile("<span style="color:blue;">&lt;ii url=&quot;<a href="http://.*">");
    //final Pattern imageDownloadUrlPattern_0 = Pattern.compile("http://[^">]*");
    final Pattern imageDownloadUrlPattern = Pattern.compile("http://upload.wikimedia.org/wikipedia/[^"]*");

    public static void main(String args[]) throws Exception {
        new GismuImageDownloader();
    }

    public static Iterable<MatchResult> getMatchResults(Pattern p, String s) {
        List<MatchResult> results = new ArrayList<MatchResult>();
        for (Matcher m = p.matcher(s); m.find();) {
            results.add(m.toMatchResult());
        }
        return results;
    }

  
    
    public static String getOneStringResult(Pattern p, String s) {
        List<MatchResult> results = new ArrayList<MatchResult>();
        for (Matcher m = p.matcher(s); m.find();) {
            results.add(m.toMatchResult());
        }
        assert (results.size() <= 1);
        if(results.size()>0)
            return results.get(0).group();
        else
            return null;
    }

    public GismuImageDownloader() throws Exception {

        //Step one, get the image list
            
            urlToWikisource = new URL("http://jbo.wikipedia.org/w/index.php?title=pixra_liste_loi_gismu&action=edit");

            String wikiSource = new Scanner(urlToWikisource.openStream()).useDelimiter("").next();
            //System.out.println( wikiSource );

            Iterable<MatchResult> imageWikiLines = getMatchResults(wikiImagePattern, wikiSource);

            List<String> gismuList = new ArrayList();


            //extract the gismu names
            for (MatchResult i : imageWikiLines) {
                String gismu_spam = getOneStringResult(gismuPattern_1, i.group());
                String gismu = getOneStringResult(gismuPattern_0, gismu_spam);
                gismuList.add(gismu);
                System.out.println(gismu);
            }

            List<String> markupList = new ArrayList();

            //extract the image markups
            for (MatchResult i : imageWikiLines) {
                String imageMarkup = getOneStringResult(imagePattern, i.group());
                markupList.add(imageMarkup);
                System.out.println(imageMarkup);
            }

        
        //Step two: download the image url html-xml spam
            List<String> fileNameList = new ArrayList<String>();
            for(int n = 0;n<markupList.size();n++){
                String imageMarkup = markupList.get(n);
                
                URL urlToImageDownloadSpam = new URL("http://commons.wikimedia.org" +
                        "/w/api.php?action=query&titles="
                        +imageMarkup.replaceAll(" ", "%20")
                        +"&prop=imageinfo&iiprop=url&&iiurlwidth=500");
                String imageDownloadSpam = new Scanner(urlToImageDownloadSpam
                        .openStream()).useDelimiter("").next();
                String url = getOneStringResult(imageDownloadUrlPattern, imageDownloadSpam);
                if(url==null){
                    gismuList.remove(n);
                    markupList.remove(n);
                    n--;
                    continue;
                }
                System.out.println("URL: "+url);
                //step three: finally download the images
                FileDownload.downloadIfNotAlreadyThere(url);
                fileNameList.add(FileDownload.getFileNameToAddr(url));
            }
            assert(markupList.size()==gismuList.size());
            System.out.println("updating the csvImageList");
            
            //step four: update the csvImageList!
            CSVPrinter cp = new CSVPrinter(new FileOutputStream("csvGismuImage"));
            for(int n=0;n<gismuList.size();n++){
                String[] arr = {gismuList.get(n),fileNameList.get(n)};
                cp.writeln(arr);
            }
            cp.close();
    }
}

 

Revise this Paste

Your Name: Code Language: