本文档全面介绍 Java 爬虫的核心原理、实现技术、反爬策略以及常见面试问题，帮助您深入理解爬虫技术并应对相关面试。

## 1. 爬虫概述

### 1.1 什么是网络爬虫

网络爬虫（Web Crawler）是一种按照一定规则自动抓取互联网信息的程序或脚本。

核心流程：

```text
URL队列 → 下载页面 → 解析内容 → 提取数据 → 存储数据 → 发现新URL → 继续循环
```

### 1.2 爬虫的分类

| 类型 | 特点 | 应用场景 |
| --- | --- | --- |
| 通用爬虫 | 大规模、全量抓取 | 搜索引擎（Google、百度） |
| 聚焦爬虫 | 特定领域、精准抓取 | 价格监控、舆情分析 |
| 增量式爬虫 | 只抓取更新内容 | 新闻聚合、数据同步 |
| 深度爬虫 | 抓取动态内容 | JavaScript 渲染页面 |

## 2. 爬虫核心原理

### 2.1 HTTP 请求原理

```java
/**
 * HTTP 请求的本质
 */
public class HttpRequestPrinciple {

    /**
     * 1. TCP 三次握手建立连接
     * 2. 发送 HTTP 请求报文
     * 3. 接收 HTTP 响应报文
     * 4. TCP 四次挥手断开连接
     */
    public void demonstrate() {
        // 一个完整的 HTTP 请求示例
        String request = """
            GET /api/data HTTP/1.1
            Host: www.example.com
            User-Agent: Mozilla/5.0
            Accept: text/html,application/json
            Cookie: sessionId=abc123
            Connection: keep-alive
            """;

        // 服务器返回的响应
        String response = """
            HTTP/1.1 200 OK
            Content-Type: application/json
            Content-Length: 123

            {"code":200,"data":...}
            """;
    }
}
```

### 2.2 解析原理

```java
/**
 * 页面解析的核心：提取结构化数据
 */
public class ParsePrinciple {

    // 1. 正则表达式 - 简单但脆弱
    public void regexParse(String html) {
        Pattern pattern = Pattern.compile("<title>(.*?)</title>");
        Matcher matcher = pattern.matcher(html);
        if (matcher.find()) {
            String title = matcher.group(1);
        }
    }

    // 2. DOM 解析 - 构建树结构
    public void domParse(String html) {
        Document doc = Jsoup.parse(html);
        Elements links = doc.select("a[href]");
        String title = doc.title();
    }

    // 3. XPath - 路径表达式
    public void xpathParse(String html) {
        // //div[@class='content']/p/text()
    }

    // 4. CSS 选择器 - 类似 jQuery
    public void cssSelector(String html) {
        // div.content p:first-child
    }
}
```

3. 
基础爬虫实现（Jsoup）

### 3.1 引入依赖

```xml
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.16.1</version>
</dependency>
```

### 3.2 基础爬虫实现

```java
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class BasicCrawler {

    private static final Logger logger = LoggerFactory.getLogger(BasicCrawler.class);

    /**
     * 抓取单个页面
     */
    public Document fetchPage(String url) throws IOException {
        // 设置请求头，模拟浏览器
        Document doc = Jsoup.connect(url)
                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
                .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
                .header("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3")
                .header("Accept-Encoding", "gzip, deflate")
                .header("Connection", "keep-alive")
                .timeout(10000) // 10秒超时
                .get();

        logger.info("成功抓取页面: {}", url);
        return doc;
    }

    /**
     * 解析页面数据
     */
    public Product parseProduct(Document doc, String url) {
        Product product = new Product();

        // 使用 CSS 选择器提取数据
        product.setTitle(doc.select("h1.product-title").text());
        product.setPrice(doc.select("span.price").text());
        product.setDescription(doc.select("div.description").text());

        // 提取图片链接
        Elements images = doc.select("img.product-image");
        List<String> imageUrls = new ArrayList<>();
        for (Element img : images) {
            imageUrls.add(img.attr("abs:src")); // 获取绝对路径
        }
        product.setImageUrls(imageUrls);

        // 提取页面中的所有链接
        Elements links = doc.select("a[href]");
        List<String> linksUrls = new ArrayList<>();
        for (Element link : links) {
            String linkUrl = link.attr("abs:href");
            if (isValidUrl(linkUrl)) {
                linksUrls.add(linkUrl);
            }
        }
        product.setRelatedLinks(linksUrls);

        return product;
    }

    private boolean isValidUrl(String url) {
        return url != null
                && !url.isEmpty()
                && (url.startsWith("http://") || url.startsWith("https://"));
    }
}
```

### 3.3 多线程爬虫实现

```java
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;

public class MultiThreadCrawler {

    private final ExecutorService executor;
    private final BlockingQueue<String> urlQueue;
    private final AtomicInteger activeCount;
    private final Set<String> visitedUrls;

    public MultiThreadCrawler(int threadCount) {
        this.executor = Executors.newFixedThreadPool(threadCount);
        this.urlQueue = new LinkedBlockingQueue<>();
        this.activeCount = new AtomicInteger(0);
        this.visitedUrls = ConcurrentHashMap.newKeySet();
    }

    /**
     * 启动爬虫
     */
    public void start(String seedUrl) {
        urlQueue.offer(seedUrl);

        while (true) {
            try {
                String url = urlQueue.poll(5, TimeUnit.SECONDS);

                if (url == null && activeCount.get() == 0) {
                    break; // 队列为空且没有活跃任务，退出
                }

                if (url != null && !visitedUrls.contains(url)) {
                    visitedUrls.add(url);
                    activeCount.incrementAndGet();
                    executor.submit(new CrawlTask(url));
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                break;
            }
        }

        shutdown();
    }

    /**
     * 爬取任务
     */
    private class CrawlTask implements Runnable {

        private final String url;

        public CrawlTask(String url) {
            this.url = url;
        }

        @Override
        public void run() {
            try {
                logger.info("开始爬取: {}", url);

                // 下载页面
                Document doc = Jsoup.connect(url)
                        .userAgent("Mozilla/5.0")
                        .timeout(10000)
                        .get();

                // 解析并提取数据
                processPage(doc, url);

                // 提取新链接
                Elements links = doc.select("a[href]");
                for (Element link : links) {
                    String newUrl = link.attr("abs:href");
                    if (shouldCrawl(newUrl)) {
                        urlQueue.offer(newUrl);
                    }
                }

                logger.info("完成爬取: {}", url);
            } catch (Exception e) {
                logger.error("爬取失败: {}", url, e);
            } finally {
                activeCount.decrementAndGet();
            }
        }

        private void processPage(Document doc, String url) {
            // 具体的数据处理逻辑
        }

        private boolean shouldCrawl(String url) {
            return url != null
                    && url.startsWith("https://example.com")
                    && !visitedUrls.contains(url);
        }
    }

    public void shutdown() {
        executor.shutdown();
        try {
            executor.awaitTermination(30, TimeUnit.SECONDS);
        } catch (InterruptedException e) {
            executor.shutdownNow();
        }
    }
}
```

## 4. 高级爬虫实现（HttpClient + Jsoup）

### 4.1 引入依赖

```xml
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.14</version>
</dependency>
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.16.1</version>
</dependency>
```

### 4.2 HttpClient 配置

```java
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.springframework.stereotype.Component;

@Component
public class HttpClientFactory {

    private CloseableHttpClient httpClient;

    public CloseableHttpClient createHttpClient() {
        if (httpClient == null) {
            // 连接池配置
            PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
            cm.setMaxTotal(200); // 最大连接数
            cm.setDefaultMaxPerRoute(50); // 每个路由最大连接数

            // 请求配置
            RequestConfig requestConfig = RequestConfig.custom()
                    .setConnectTimeout(5000) // 连接超时
                    .setSocketTimeout(10000) // 读取超时
                    .setConnectionRequestTimeout(3000) // 从连接池获取连接超时
                    .setRedirectsEnabled(true) // 自动重定向
                    .build();

            httpClient = HttpClients.custom()
                    .setConnectionManager(cm)
                    .setDefaultRequestConfig(requestConfig)
                    .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
                    .build();
        }
        return httpClient;
    }
}
```

### 4.3 高级爬虫实现

```java
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.charset.StandardCharsets;

@Component
public class AdvancedCrawler {

    private static final Logger logger = LoggerFactory.getLogger(AdvancedCrawler.class);

    @Autowired
    private HttpClientFactory httpClientFactory;

    /**
     * 带重试机制的爬取
     */
    public Document fetchWithRetry(String url, int maxRetries) {
        for (int i = 0; i < maxRetries; i++) {
            try {
                return fetch(url);
            } catch (Exception e) {
                logger.warn("第{}次爬取失败: {}", i + 1, url, e);
                if (i == maxRetries - 1) {
                    throw new RuntimeException("爬取失败，已重试" + maxRetries + "次", e);
                }
                try {
                    Thread.sleep(1000L * (i + 1)); // 线性退避：等待时间随重试次数递增
                } catch (InterruptedException ie) {
                    Thread.currentThread().interrupt();
                    break;
                }
            }
        }
        return null;
    }

    /**
     * 基础爬取方法
     */
    private Document fetch(String url) throws IOException {
        CloseableHttpClient httpClient = httpClientFactory.createHttpClient();
        HttpGet httpGet = new HttpGet(url);

        // 设置请求头
        httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
        httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.9");
        httpGet.setHeader("Cache-Control", "no-cache");
        httpGet.setHeader("Connection", "keep-alive");

        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            int statusCode = response.getStatusLine().getStatusCode();

            if (statusCode == 200) {
                String html = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8);
                return Jsoup.parse(html, url);
            } else if (statusCode == 403) {
                logger.error("访问被拒绝: {}", url);
                throw new IOException("HTTP 403 Forbidden");
            } else if (statusCode == 404) {
                logger.error("页面不存在: {}", url);
                throw new IOException("HTTP 404 Not Found");
            } else {
                logger.warn("HTTP状态码异常: {} - {}", statusCode, url);
                throw new IOException("HTTP " + statusCode);
            }
        }
    }

    /**
     * 使用代理IP
     */
    public Document fetchWithProxy(String url, String proxyHost, int proxyPort) throws IOException {
        CloseableHttpClient httpClient = HttpClients.custom()
                .setProxy(new HttpHost(proxyHost, proxyPort))
                .build();

        HttpGet httpGet = new HttpGet(url);

        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            String html = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8);
            return Jsoup.parse(html, url);
        } finally {
            httpClient.close();
        }
    }
}
```

5. 
动态页面爬取（Selenium WebDriver）

### 5.1 引入依赖

```xml
<dependency>
    <groupId>org.seleniumhq.selenium</groupId>
    <artifactId>selenium-java</artifactId>
    <version>4.15.0</version>
</dependency>
```

### 5.2 Selenium 配置

```java
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.support.ui.WebDriverWait;
import org.springframework.stereotype.Component;

import java.time.Duration;
import java.util.concurrent.TimeUnit;

@Component
public class SeleniumDriverFactory {

    public WebDriver createChromeDriver() {
        System.setProperty("webdriver.chrome.driver", "/path/to/chromedriver");

        ChromeOptions options = new ChromeOptions();
        // 无头模式（不显示浏览器界面）
        options.addArguments("--headless");
        // 禁用GPU加速
        options.addArguments("--disable-gpu");
        // 设置窗口大小
        options.addArguments("--window-size=1920,1080");
        // 禁用沙箱
        options.addArguments("--no-sandbox");
        // 禁用图片加载，提高速度
        options.addArguments("--blink-settings=imagesEnabled=false");
        // 设置用户代理
        options.addArguments("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36");

        WebDriver driver = new ChromeDriver(options);
        // 设置隐式等待
        driver.manage().timeouts().implicitlyWait(10, TimeUnit.SECONDS);
        // 设置页面加载超时
        driver.manage().timeouts().pageLoadTimeout(30, TimeUnit.SECONDS);

        return driver;
    }
}
```

### 5.3 动态内容爬取

```java
import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import java.time.Duration;
import java.util.List;
import java.util.stream.Collectors;

@Service
public class DynamicCrawler {

    @Autowired
    private SeleniumDriverFactory driverFactory;

    /**
     * 爬取Ajax动态加载的页面
     */
    public List<String> crawlInfiniteScroll(String url) {
        WebDriver driver = driverFactory.createChromeDriver();
        WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(30));

        try {
            driver.get(url);

            // 等待页面加载
            Thread.sleep(3000);

            // 滚动加载更多内容
            JavascriptExecutor js = (JavascriptExecutor) driver;
            long lastHeight = (long) js.executeScript("return document.body.scrollHeight");

            while (true) {
                // 滚动到底部
                js.executeScript("window.scrollTo(0, document.body.scrollHeight);");
                Thread.sleep(2000); // 等待新内容加载

                long newHeight = (long) js.executeScript("return document.body.scrollHeight");
                if (newHeight == lastHeight) {
                    break;
                }
                lastHeight = newHeight;
            }

            // 提取数据
            List<WebElement> items = driver.findElements(By.cssSelector(".item"));
            return items.stream()
                    .map(WebElement::getText)
                    .collect(Collectors.toList());
        } catch (Exception e) {
            throw new RuntimeException("动态页面爬取失败", e);
        } finally {
            driver.quit();
        }
    }

    /**
     * 等待元素出现后爬取
     */
    public String waitAndCrawl(String url, String selector) {
        WebDriver driver = driverFactory.createChromeDriver();
        WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(10));

        try {
            driver.get(url);

            // 等待元素出现
            WebElement element = wait.until(
                    ExpectedConditions.presenceOfElementLocated(By.cssSelector(selector))
            );

            return element.getText();
        } finally {
            driver.quit();
        }
    }

    /**
     * 处理验证码（示例：调用打码平台）
     */
    public void handleCaptcha(WebDriver driver, String captchaImageSelector) {
        WebElement captchaImage = driver.findElement(By.cssSelector(captchaImageSelector));
        String imageUrl = captchaImage.getAttribute("src");

        // 调用打码平台API
        String captchaCode = callCaptchaAPI(imageUrl);

        // 输入验证码
        WebElement input = driver.findElement(By.cssSelector("#captcha-input"));
        input.sendKeys(captchaCode);
    }

    private String callCaptchaAPI(String imageUrl) {
        // 调用第三方打码平台
        return "1234";
    }
}
```

## 6. 反爬策略与应对

### 6.1 常见反爬手段

| 反爬手段 | 原理 | 应对策略 |
| --- | --- | --- |
| IP封禁 | 检测IP访问频率 | 代理IP池、延迟请求 |
| User-Agent检测 | 检查请求头 | 随机User-Agent |
| Cookie/Session验证 | 需要登录态 | 模拟登录、保存Cookie |
| 验证码 | 图形/滑块验证 | 打码平台、OCR识别 |
| 动态Token | 请求需要动态令牌 | 逆向分析JS、模拟生成 |
| 字体反爬 | 使用自定义字体 | 解析字体映射 |
| 数据加密 | 响应数据加密 | JS逆向解密 |
| IP限流 | 频率限制 | 分布式爬取、调整频率 |

### 6.2 反爬应对实现

```java
@Component
public class AntiAntiCrawler {

    /**
     * 1. 随机 User-Agent
     */
    private static final List<String> USER_AGENTS = Arrays.asList(
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15"
    );

    public String getRandomUserAgent() {
        return USER_AGENTS.get(new Random().nextInt(USER_AGENTS.size()));
    }

    /**
     * 2. 随机延迟
     */
    public void randomDelay() {
        try {
            int delay = 1000 + new Random().nextInt(3000);
            Thread.sleep(delay);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /**
     * 3. 代理IP池
     */
    private BlockingQueue<Proxy> proxyPool = new LinkedBlockingQueue<>();

    public Proxy getRandomProxy() {
        List<Proxy> proxies = new ArrayList<>(proxyPool);
        if (proxies.isEmpty()) {
            return null;
        }
        return proxies.get(new Random().nextInt(proxies.size()));
    }

    /**
     * 4. Cookie管理
     */
    private BasicCookieStore cookieStore = new BasicCookieStore();

    public void saveCookie(HttpHost target, Cookie cookie) {
        cookieStore.addCookie(cookie);
    }

    public List<Cookie> getCookies() {
        return cookieStore.getCookies();
    }

    /**
     * 5. 模拟浏览器行为
     */
    public CloseableHttpClient createBrowserLikeClient() {
        // 配置SSL
        SSLContext sslContext = SSLContextBuilder.create()
                .loadTrustMaterial(TrustAllStrategy.INSTANCE)
                .build();

        SSLConnectionSocketFactory sslSocketFactory = new SSLConnectionSocketFactory(
                sslContext, NoopHostnameVerifier.INSTANCE);

        // 配置Cookie存储
        CookieStore cookieStore = new BasicCookieStore();

        // 构建HttpClient
        return HttpClients.custom()
                .setSSLContext(sslContext)
                .setSSLHostnameVerifier(NoopHostnameVerifier.INSTANCE)
                .setDefaultCookieStore(cookieStore)
                .setUserAgent(getRandomUserAgent())
                .setDefaultRequestConfig(RequestConfig.custom()
                        .setCookieSpec(CookieSpecs.STANDARD)
                        .build())
                .build();
    }

    /**
     * 6. 验证码识别（OCR）
     */
    public String recognizeCaptcha(BufferedImage image) {
        // 使用Tesseract OCR
        Tesseract tesseract = new Tesseract();
        tesseract.setDatapath("/usr/share/tesseract-ocr/tessdata");

        try {
            return tesseract.doOCR(image).trim();
        } catch (TesseractException e) {
            logger.error("验证码识别失败", e);
            return "";
        }
    }
}
```

### 6.3 代理IP池实现

```java
@Component
public class ProxyPool {

    private final Set<String> proxySet = ConcurrentHashMap.newKeySet();
    private final Queue<Proxy> availableProxy = new ConcurrentLinkedQueue<>();

    /**
     * 从免费代理网站获取代理
     */
    @Scheduled(fixedDelay = 300000) // 每5分钟更新一次
    public void updateProxyPool() {
        List<String> proxyUrls = Arrays.asList(
                "https://www.kuaidaili.com/free/",
                "https://proxy.scrapeops.io/"
        );

        for (String url : proxyUrls) {
            try {
                Document doc = Jsoup.connect(url).get();
                Elements rows = doc.select("table tbody tr");

                for (Element row : rows) {
                    String ip = row.select("td:eq(0)").text();
                    String port = row.select("td:eq(1)").text();
                    String proxy = ip + ":" + port;

                    if (!proxySet.contains(proxy) && testProxy(ip, Integer.parseInt(port))) {
                        proxySet.add(proxy);
                        availableProxy.offer(new HttpHost(ip, Integer.parseInt(port)));
                        logger.info("添加代理: {}", proxy);
                    }
                }
            } catch (Exception e) {
                logger.error("获取代理失败", e);
            }
        }

        // 清理无效代理
        cleanInvalidProxies();
    }

    /**
     * 测试代理是否可用
     */
    private boolean testProxy(String ip, int port) {
        RequestConfig config = RequestConfig.custom()
                .setProxy(new HttpHost(ip, port))
                .setConnectTimeout(3000)
                .setSocketTimeout(3000)
                .build();

        try (CloseableHttpClient client = HttpClients.custom()
                .setDefaultRequestConfig(config)
                .build()) {
            HttpGet httpGet = new HttpGet("http://www.baidu.com");
            try (CloseableHttpResponse response = client.execute(httpGet)) {
                return response.getStatusLine().getStatusCode() == 200;
            }
        } catch (Exception e) {
            return false;
        }
    }

    /**
     * 获取可用代理
     */
    public HttpHost getProxy() {
        return availableProxy.poll();
    }

    /**
     * 清理无效代理
     */
    private void cleanInvalidProxies() {
        availableProxy.removeIf(proxy -> !testProxy(proxy.getHostName(), proxy.getPort()));
    }
}
```

7. 
数据存储

### 7.1 存储策略

```java
@Component
public class DataStorage {

    @Autowired
    private JdbcTemplate jdbcTemplate;

    @Autowired
    private MongoTemplate mongoTemplate;

    @Autowired
    private RedisTemplate<String, Object> redisTemplate;

    /**
     * 存储到关系型数据库
     */
    public void saveToMySQL(Product product) {
        String sql = """
            INSERT INTO product (title, price, description, url, create_time)
            VALUES (?, ?, ?, ?, ?)
            ON DUPLICATE KEY UPDATE
            price = VALUES(price),
            update_time = NOW()
            """;

        jdbcTemplate.update(sql,
                product.getTitle(),
                product.getPrice(),
                product.getDescription(),
                product.getUrl(),
                new Date());
    }

    /**
     * 存储到MongoDB（适合非结构化数据）
     */
    public void saveToMongoDB(Product product) {
        Document doc = new Document()
                .append("title", product.getTitle())
                .append("price", product.getPrice())
                .append("description", product.getDescription())
                .append("url", product.getUrl())
                .append("crawlTime", new Date());

        mongoTemplate.insert(doc, "products");
    }

    /**
     * 存储到Redis（适合缓存、去重）
     */
    public void saveToRedis(String key, Object value) {
        redisTemplate.opsForValue().set(key, value, 1, TimeUnit.HOURS);
    }

    /**
     * URL去重（使用Redis Set）
     */
    public boolean isUrlVisited(String url) {
        Boolean added = redisTemplate.opsForSet().add("visited_urls", url);
        return added != null && !added; // 如果已存在返回false
    }
}
```

8. 
分布式爬虫架构

### 8.1 架构设计

```text
┌─────────────────────────────────────────────────────────┐
│                        调度中心                         │
│                     (Redis / Kafka)                     │
└─────────────────────────────────────────────────────────┘
                            ↓
      ┌──────────────┬──────────────┬──────────────┐
      │  爬虫节点1   │  爬虫节点2   │  爬虫节点3   │
      │  (Consumer)  │  (Consumer)  │  (Consumer)  │
      └──────────────┴──────────────┴──────────────┘
                            ↓
    ┌─────────────────────────────────────────────────┐
    │                    数据存储                     │
    │         MySQL / MongoDB / Elasticsearch         │
    └─────────────────────────────────────────────────┘
```

### 8.2 基于Redis的分布式爬虫

```java
@Component
public class DistributedCrawler {

    @Autowired
    private RedisTemplate<String, String> redisTemplate;

    private static final String URL_QUEUE_KEY = "crawler:url_queue";
    private static final String URL_VISITED_KEY = "crawler:visited";

    /**
     * 生产者：添加URL到队列
     */
    public void addUrls(List<String> urls) {
        for (String url : urls) {
            // 去重检查
            if (!isVisited(url)) {
                redisTemplate.opsForList().rightPush(URL_QUEUE_KEY, url);
            }
        }
    }

    /**
     * 消费者：获取URL
     */
    public String getUrl() {
        return redisTemplate.opsForList().leftPop(URL_QUEUE_KEY, 5, TimeUnit.SECONDS);
    }

    /**
     * 标记URL为已访问
     */
    public void markVisited(String url) {
        redisTemplate.opsForSet().add(URL_VISITED_KEY, url);
    }

    /**
     * 检查URL是否已访问
     */
    public boolean isVisited(String url) {
        Boolean exists = redisTemplate.opsForSet().isMember(URL_VISITED_KEY, url);
        return Boolean.TRUE.equals(exists);
    }

    /**
     * 消费者线程
     */
    @Async
    public void startConsumer() {
        while (true) {
            String url = getUrl();
            if (url == null) {
                try {
                    Thread.sleep(1000);
                    continue;
                } catch (InterruptedException e) {
                    break;
                }
            }

            try {
                // 爬取页面
                Document doc = Jsoup.connect(url).get();
                processPage(doc, url);
                markVisited(url);
            } catch (Exception e) {
                logger.error("爬取失败: {}", url, e);
                // 失败重试
                redisTemplate.opsForList().rightPush(URL_QUEUE_KEY, url);
            }
        }
    }
}
```

## 9. 面试高频问题

### 9.1 基础问题

#### Q1: 请介绍一下Java爬虫的实现原理？

```java
/**
 * 核心原理：
 * 1. HTTP协议模拟：使用HttpClient/OkHttp模拟浏览器发送请求
 * 2. 页面解析：使用Jsoup/XPath/正则表达式提取数据
 * 3. URL管理：队列 + 去重，实现广度优先/深度优先遍历
 * 4. 数据存储：保存到数据库/文件
 */
```

#### Q2: Jsoup和HttpClient的区别？

| 对比项 | Jsoup | HttpClient |
| --- | --- | --- |
| 定位 | HTML解析库 | HTTP客户端 |
| 功能 | 解析、提取、DOM操作 | 发送HTTP请求、连接管理 |
| 使用场景 | 页面解析 | 请求发送 |
| 性能 | 较慢（需解析DOM） | 较快（原生HTTP） |

最佳实践：HttpClient发送请求 + Jsoup解析内容。

#### Q3: 如何处理动态页面？

```java
// 方案1: 使用Selenium模拟浏览器
WebDriver driver = new ChromeDriver();
driver.get(url);

// 方案2: 分析Ajax接口，直接调用
String apiUrl = "https://api.example.com/data";
String data = HttpClient.get(apiUrl);

// 方案3: 使用无头浏览器
ChromeOptions options = new ChromeOptions();
options.addArguments("--headless");
```

### 9.2 反爬问题

#### Q4: 遇到反爬怎么办？

```java
// 1. 更换User-Agent
// 2. 使用代理IP
// 3. 添加随机延迟
// 4. 模拟登录
// 5. 处理验证码
// 6. 分布式爬取
// 7. 解析JS生成的数据
```

#### Q5: 如何绕过IP封禁？

```java
@Component
public class IPBypass {

    // 方案1: 代理IP池
    private Queue<Proxy> proxyPool;

    // 方案2: 使用Tor网络
    // 方案3: 分布式部署在不同服务器

    public HttpHost getProxy() {
        return proxyPool.poll();
    }

    // 方案4: 降低请求频率
    public void slowDown() {
        Thread.sleep(1000 + new Random().nextInt(2000));
    }
}
```

#### Q6: 验证码如何解决？

```java
// 1. OCR识别（简单验证码）
Tesseract tesseract = new Tesseract();
String code = tesseract.doOCR(captchaImage);

// 2. 机器学习识别
// 3. 第三方打码平台
String result = DamaClient.solve(captchaImage);

// 4. 人工打码（小规模）
// 5. 模拟浏览器保留Cookie
```

### 9.3 性能优化

#### Q7: 如何提高爬虫效率？

```java
// 1. 多线程/线程池
ExecutorService executor = Executors.newFixedThreadPool(10);

// 2. 连接池复用
PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
cm.setMaxTotal(200);

// 3. 异步IO（使用HttpAsyncClient）
CloseableHttpAsyncClient client = HttpAsyncClients.createDefault();
client.start();

// 4. 分布式部署
// 5. 增量爬取（只爬取更新内容）
// 6. 使用缓存避免重复请求
```

#### Q8: 如何避免重复爬取？

```java
@Component
public class DuplicateChecker {

    // 1. BloomFilter（内存友好）
    private BloomFilter<String> bloomFilter = BloomFilter.create(
            Funnels.stringFunnel(Charset.forName("UTF-8")),
            1000000, // 预期元素数量
            0.01 // 误判率
    );

    // 2. Redis Set（分布式）
    public boolean isDuplicate(String url) {
        return redisTemplate.opsForSet().add("visited_urls", url) == 0;
    }

    // 3. URL标准化（去除参数）
    public String normalizeUrl(String url) {
        // 去除无关参数
        // 统一大小写
        // 处理相对路径
        return url;
    }
}
```

### 9.4 扩展问题

#### Q9: 如何处理登录后才能访问的页面？

javaComponent public class LoginCrawler { /** * 模拟登录 */ public CloseableHttpClient login(String username, String password) { CloseableHttpClient client HttpClients.createDefault(); // 1. 获取登录页面提取CSRF token HttpGet get new HttpGet(https://example.com/login); String html EntityUtils.toString(client.execute(get).getEntity()); String token extractCsrfToken(html); // 2. 提交登录表单 HttpPost post new HttpPost(https://example.com/login); ListNameValuePair params new ArrayList(); params.add(new BasicNameValuePair(username, username)); params.add(new BasicNameValuePair(password, password)); params.add(new BasicNameValuePair(csrf_token, token)); post.setEntity(new UrlEncodedFormEntity(params)); // 3. 保存Cookie CookieStore cookieStore new BasicCookieStore(); client.execute(post); // 返回带Cookie的客户端 return HttpClients.custom() .set