Scraping in Java using Selenium and ChromeDriver.
わけあって、JavaScript でページをロードするようなサイトをスクレイピングして画像を取得したくなったので、やってみました。
scaj という名前でつくります。
- build.gradle
// Gradle build for the "scaj" scraper: a Java 8 command-line application
// that is also packaged as a single self-contained ("fat") jar.

group = 'scaj'
version = '1.0-SNAPSHOT'

apply plugin: 'java'
apply plugin: 'eclipse'
apply plugin: 'idea'
apply plugin: 'application'

sourceCompatibility = 1.8
targetCompatibility = 1.8

repositories {
    mavenCentral()
}

dependencies {
    // Browser automation.
    compile 'org.seleniumhq.selenium:selenium-server:3.0.1'
    // Logging backend for SLF4J.
    compile 'org.slf4j:slf4j-simple:1.7.12'
    // HTTP client.
    compile 'com.squareup.okhttp3:okhttp:3.5.0'
    compile 'commons-codec:commons-codec:1.10'
    // Annotation processor; only needed at compile time.
    compileOnly 'org.projectlombok:lombok:1.16.12'

    testCompile 'junit:junit:4.11'
}

task wrapper(type: Wrapper) {
    gradleVersion = '3.2'
}

jar {
    baseName = 'scaj'
    version = '1.0'

    manifest {
        attributes 'Implementation-Title': 'Scaj',
                   'Implementation-Version': 1.0
        manifest.mainAttributes("Main-Class": "net.vg4.scaj.App")
    }

    // Unpack every compile dependency into the jar so it can run standalone.
    from configurations.compile.collect { it.isDirectory() ? it : zipTree(it) }
}

mainClassName = "net.vg4.scaj.App"
- App.java
package net.vg4.scaj;

import net.vg4.scaj.Tasks.DownloadImages;
import org.openqa.selenium.WebDriver;

/**
 * Command-line entry point: runs the {@link DownloadImages} task once and
 * then shuts down the browser session it hands back.
 */
public class App {

    /**
     * Runs the download task and quits the returned {@link WebDriver}.
     *
     * <p>Any exception raised by the task or by quitting the driver is printed
     * as a stack trace and otherwise ignored. In that case the driver is not
     * quit from here, because no reference to it was returned.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        final DownloadImages task = new DownloadImages();
        try {
            final WebDriver browser = task.apply();
            browser.quit();
        } catch (Exception failure) {
            failure.printStackTrace();
        }
    }
}
- DownloadImages.java
/**
 * Drives the browser through the site's menus, opens the first photo and then
 * downloads every photo it can reach, moving on to the next result page when
 * the current one has no further photos.
 *
 * <p>The CSS selectors and title prefixes are placeholders ({@code "#..."},
 * {@code "..."}) in this listing and must be filled in for the target site.
 *
 * <p>The method stops and returns the driver when any of the following happens:
 * <ul>
 *   <li>more than 3000 photos have been downloaded,</li>
 *   <li>a download fails,</li>
 *   <li>paging fails with an unexpected error, or the next-page link cannot be
 *       followed after a bounded number of scroll attempts,</li>
 *   <li>the thread is interrupted while paging (the interrupt flag is
 *       restored).</li>
 * </ul>
 *
 * @return the driver used for the session, so that the caller can quit it
 * @throws Exception if one of the initial navigation steps or waits fails, or
 *                   the thread is interrupted during one of the plain sleeps
 */
public WebDriver apply() throws Exception {
    // Hard stop so a misbehaving site cannot make the scraper run forever.
    final int maxDownloads = 3000;
    // How many scroll-and-look rounds to spend searching for the next-page
    // link before concluding that the last page has been reached.
    final int maxPagingAttempts = 5;
    // Number of scroll steps per round; each step is followed by a short pause
    // so lazily loaded content has a chance to appear.
    final int scrollStepsPerAttempt = 100;

    WebElement popup = driver.findElement(By.cssSelector("#..."));
    popup.click();
    WebElement menu = driver.findElement(By.cssSelector("#..."));
    menu.click();

    Wait<WebDriver> wait = new WebDriverWait(driver, 10);
    wait.until((ExpectedCondition<Boolean>) d -> d.getTitle().startsWith("..."));

    WebElement menu2 = driver.findElement(By.cssSelector("a[href='/...']"));
    menu2.click();
    Thread.sleep(1000);
    wait.until((ExpectedCondition<Boolean>) d -> d.getTitle().startsWith("..."));
    Thread.sleep(1000);

    WebElement loop = wait.until(ExpectedConditions.elementToBeClickable(By.cssSelector("#...")));
    loop.click();
    Thread.sleep(500);

    WebElement first = wait.until(ExpectedConditions.elementToBeClickable(By.cssSelector("#...")));
    log.info("first : {}", first);
    first.click();

    int count = 0;
    // Short-timeout wait for elements whose absence is an expected outcome.
    Wait<WebDriver> wait2 = new WebDriverWait(driver, 1);

    while (true) {
        Thread.sleep(500);
        WebElement pic = wait.until(ExpectedConditions.elementToBeClickable(By.cssSelector("#...")));
        log.info("pic : {}", pic);
        String src = pic.getAttribute("src");
        WebElement photonum = wait2.until(ExpectedConditions.elementToBeClickable(By.cssSelector("#...")));

        try {
            downloadFileSync(src, photonum.getText());
            count++;
            if (count > maxDownloads) {
                log.info("over 3000");
                return driver;
            }
        } catch (Exception e) {
            log.error("download failed for {}", src, e);
            return driver;
        }

        try {
            WebElement next = wait2.until(ExpectedConditions.elementToBeClickable(By.cssSelector("#...")));
            next.click();
            log.info("next : {}", next);
            continue;
        } catch (Exception e) {
            // No usable "next" button: this was the last photo of the page, so
            // fall through, close the viewer and look for the next page.
            log.debug("no next photo, closing viewer: {}", e.toString());
        }

        WebElement close = wait.until(ExpectedConditions.elementToBeClickable(By.cssSelector("#...")));
        close.click();
        log.info("close : {}", close);
        Thread.sleep(1000);

        JavascriptExecutor jse = (JavascriptExecutor) driver;
        boolean advanced = false;
        try {
            for (int attempt = 1; attempt <= maxPagingAttempts && !advanced; attempt++) {
                // Scroll far down repeatedly so the next-page link gets rendered.
                for (int i = 0; i < scrollStepsPerAttempt; i++) {
                    jse.executeScript("window.scrollBy(0,1000000)", "");
                    Thread.sleep(100);
                }
                try {
                    WebElement nextpage = wait2.until(ExpectedConditions.elementToBeClickable(By.cssSelector("#...")));
                    nextpage.click();
                    log.info("nextpage : {}", nextpage);
                    Thread.sleep(1000);
                    WebElement first2 = wait.until(ExpectedConditions.elementToBeClickable(By.cssSelector("#...")));
                    log.info("first2 : {}", first2);
                    first2.click();
                    advanced = true;
                } catch (InterruptedException e) {
                    // Do not swallow the interrupt: restore the flag and hand the
                    // driver back so the caller can still quit the browser.
                    Thread.currentThread().interrupt();
                    return driver;
                } catch (Exception e) {
                    // Link not there (yet); scroll again on the next attempt.
                    log.debug("next page not reachable on attempt {}: {}", attempt, e.toString());
                }
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return driver;
        } catch (Exception e) {
            log.error("paging failed", e);
            break;
        }

        if (!advanced) {
            // Previously this case retried forever; treat it as the last page.
            log.info("no next page after {} attempts, stopping", maxPagingAttempts);
            break;
        }
    }
    return driver;
}
とっても雑ですが、学びは、要素は全部 wait でとったほうが楽だったことと、AutoPagerize 系はなにげに面倒だなってことでした。あと、途中で driver.get するのはダメだなってこと。
こういうのは Ruby のほうが短く書けていいって思いながら書きました。Kotlin にするとかもいいかもですね。