無法抓取網站,出現 timeout
用 WebMagic 框架(webmagic-core 0.7.5)寫了個簡單的爬蟲程式,有些網站能正常抓取,有些網站則出現 timeout。想請問這可能是哪方面的問題?看了許久仍找不出原因。 目前抓取的網站是: https://www.arrow.com/en/categories/diodes-transistors-and-thyristors/bipolar-transistors/rf-bjt?page=1 代碼如下: public class ArrowPageProcessor implements PageProcessor {
/**
 * Site configuration handed out by {@link #getSite()}: retry count 3,
 * sleep time 1000, timeout 10000 (presumably milliseconds; confirm against
 * the WebMagic {@code Site} documentation) and a desktop Chrome user-agent header.
 */
private Site site = Site.me()
        .setRetryTimes(3)
        .setSleepTime(1000)
        .setTimeOut(10000)
        .addHeader(
                "user-agent",
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.83 Safari/537.36");
/**
 * Extracts the second-to-last {@code <li class="ng-star-inserted">} element
 * from the downloaded page, stores it under the result field
 * {@code "lastpage"} and echoes it to standard output.
 *
 * <p>NOTE(review): {@code ng-star-inserted} looks like a class added by
 * client-side rendering; presumably it is absent from HTML fetched without
 * executing JavaScript, in which case the selection is empty. Confirm
 * against the raw response.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    // Evaluate the XPath once and reuse the result for both the stored
    // field and the debug output, instead of querying the document twice.
    String lastpage = page.getHtml()
            .xpath("//li[@class='ng-star-inserted'][last()-1]")
            .toString();
    page.putField("lastpage", lastpage);
    System.out.println("test:" + lastpage);
}
/**
 * Returns the site configuration held by this processor.
 *
 * @return the {@code Site} instance stored in the {@code site} field
 */
@Override
public Site getSite() {
    return this.site;
}
/**
 * Creates a spider backed by a new {@code ArrowPageProcessor}, seeds it
 * with the Arrow RF-BJT category listing (page 1), configures five threads
 * and starts the crawl.
 */
public void Run() {
    // Seed URL the crawl starts from.
    // Other URLs previously tried here:
    //   https://www.mouser.tw/c/semiconductors/discrete-semiconductors/transistors/rf-transistors/?pg=2
    //   https://blog.csdn.net/ty497122758/article/details/78495183
    String startUrl = "https://www.arrow.com/en/categories/diodes-transistors-and-thyristors/bipolar-transistors/rf-bjt?page=1";

    Spider crawler = Spider.create(new ArrowPageProcessor()).addUrl(startUrl);

    // Five crawler threads, then start.
    crawler.thread(5).run();
}
} 報錯如下: java.net.SocketTimeoutException: Read timed out at java.base/sun.nio.ch.NioSocketImpl.timedRead(NioSocketImpl.java:284) ~[na:na] at java.base/sun.nio.ch.NioSocketImpl.implRead(NioSocketImpl.java:310) ~[na:na] at java.base/sun.nio.ch.NioSocketImpl.read(NioSocketImpl.java:351) ~[na:na] at java.base/sun.nio.ch.NioSocketImpl$1.read(NioSocketImpl.java:802) ~[na:na] at java.base/java.net.Socket$SocketInputStream.read(Socket.java:937) ~[na:na] at java.base/sun.security.ssl.SSLSocketInputRecord.read(SSLSocketInputRecord.java:450) ~[na:na] at java.base/sun.security.ssl.SSLSocketInputRecord.bytesInCompletePacket(SSLSocketInputRecord.java:68) ~[na:na] at java.base/sun.security.ssl.SSLSocketImpl.readApplicationRecord(SSLSocketImpl.java:1409) ~[na:na] at java.base/sun.security.ssl.SSLSocketImpl$AppInputStream.read(SSLSocketImpl.java:1022) ~[na:na] at org.apache.http.impl.io.SessionInputBufferImpl.streamRead(SessionInputBufferImpl.java:137) ~[httpcore-4.4.15.jar:4.4.15] at org.apache.http.impl.io.SessionInputBufferImpl.fillBuffer(SessionInputBufferImpl.java:153) ~[httpcore-4.4.15.jar:4.4.15] at org.apache.http.impl.io.SessionInputBufferImpl.readLine(SessionInputBufferImpl.java:280) ~[httpcore-4.4.15.jar:4.4.15] at org.apache.http.impl.conn.DefaultHttpResponseParser.parseHead(DefaultHttpResponseParser.java:138) ~[httpclient-4.5.13.jar:4.5.13] at org.apache.http.impl.conn.DefaultHttpResponseParser.parseHead(DefaultHttpResponseParser.java:56) ~[httpclient-4.5.13.jar:4.5.13] at org.apache.http.impl.io.AbstractMessageParser.parse(AbstractMessageParser.java:259) ~[httpcore-4.4.15.jar:4.4.15] at org.apache.http.impl.DefaultBHttpClientConnection.receiveResponseHeader(DefaultBHttpClientConnection.java:163) ~[httpcore-4.4.15.jar:4.4.15] at org.apache.http.impl.conn.CPoolProxy.receiveResponseHeader(CPoolProxy.java:157) ~[httpclient-4.5.13.jar:4.5.13] at org.apache.http.protocol.HttpRequestExecutor.doReceiveResponse(HttpRequestExecutor.java:273) ~[httpcore-4.4.15.jar:4.4.15] at 
org.apache.http.protocol.HttpRequestExecutor.execute(HttpRequestExecutor.java:125) ~[httpcore-4.4.15.jar:4.4.15] at org.apache.http.impl.execchain.MainClientExec.execute(MainClientExec.java:272) ~[httpclient-4.5.13.jar:4.5.13] at org.apache.http.impl.execchain.ProtocolExec.execute(ProtocolExec.java:186) ~[httpclient-4.5.13.jar:4.5.13] at org.apache.http.impl.execchain.RetryExec.execute(RetryExec.java:89) ~[httpclient-4.5.13.jar:4.5.13] at org.apache.http.impl.execchain.RedirectExec.execute(RedirectExec.java:110) ~[httpclient-4.5.13.jar:4.5.13] at org.apache.http.impl.client.InternalHttpClient.doExecute(InternalHttpClient.java:185) ~[httpclient-4.5.13.jar:4.5.13] at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:83) ~[httpclient-4.5.13.jar:4.5.13] at us.codecraft.webmagic.downloader.HttpClientDownloader.download(HttpClientDownloader.java:83) ~[webmagic-core-0.7.5.jar:na] at us.codecraft.webmagic.Spider.processRequest(Spider.java:419) ~[webmagic-core-0.7.5.jar:na] at us.codecraft.webmagic.Spider.access$000(Spider.java:61) ~[webmagic-core-0.7.5.jar:na] at us.codecraft.webmagic.Spider$1.run(Spider.java:322) ~[webmagic-core-0.7.5.jar:na] at us.codecraft.webmagic.thread.CountableThreadPool$1.run(CountableThreadPool.java:74) ~[webmagic-core-0.7.5.jar:na] at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) ~[na:na] at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) ~[na:na] at java.base/java.lang.Thread.run(Thread.java:830) ~[na:na]
只开一个线程(`.thread(1)`),并且降低抓取频率(例如调大 `setSleepTime` 的值)试试。