[工具代码]使用Java爬取b站弹幕文件

工具代码

Posted by 王维维 on September 12, 2016

“抛砖引玉”

简介

跳过废话,直接看正文

使用Java写的爬取b站弹幕文件的工具类,只支持www.bilibili.com/video/av号/这类网址,其他的不支持。


正文

  • Constants
/**
 * String templates used when downloading Bilibili danmaku (bullet-comment) files.
 *
 * <p>This class only holds constants and is not meant to be instantiated.
 */
public final class Constants {
    /**
     * Relative path template of the local danmaku file. The {@code VIDEO_NAME}
     * token is a placeholder that callers substitute with the video name.
     */
    public static final String BILIBILI_DANMU_FILE_PATH_PATTERN = "danmures/bilibili" + "/VIDEO_NAME.xml";

    /**
     * URL template of the remote danmaku XML. The {@code CID} token is a
     * placeholder that callers substitute with the video's cid.
     */
    public static final String BILIBILI_DANMU_URL_PATH_PATTERN = "http://comment.bilibili.tv/CID.xml";

    /** Prevents instantiation of this constants holder. */
    private Constants() {
    }
}
  • getBilibiliDanmuFileByUrl
import java.io.IOException;
import java.io.InputStream;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.CoreConnectionPNames;

/**
 * Helper for fetching the danmaku (bullet-comment) XML file that belongs to a
 * Bilibili video page.
 */
public class ClawUtil {
    /**
     * Fetches the video page at {@code url}, extracts the video's cid and title
     * from the returned HTML, and downloads the matching danmaku XML into the
     * path derived from {@link Constants#BILIBILI_DANMU_FILE_PATH_PATTERN}.
     * The download is skipped when the target file already exists.
     *
     * @param url address of the video page
     * @return the path of the danmaku file, or {@code null} when the page could
     *         not be fetched, the cid or title could not be found in the page,
     *         or the danmaku download failed
     */
    public static String getBilibiliDanmuFileByUrl(final String url) {
        final HttpGet httpGet = new HttpGet(url);
        final HttpClient sHttpClient = new DefaultHttpClient();
        try {
            // send request
            httpGet.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, 20000);
            final HttpResponse response = sHttpClient.execute(httpGet);
            final HttpEntity entity = response.getEntity();
            if (entity == null) {
                // A response without a body has nothing to parse.
                return null;
            }

            // unzip to get htmlString
            final String htmlString = StringUtil.gzipInputStreamToUTF8String(entity.getContent());

            // The cid is taken from the first "cid=...&" occurrence in the page,
            // the video name from the <title> element.
            final String cid = substringBetween(htmlString, "cid=", "&");
            final String title = substringBetween(htmlString, "<title>", "</title>");
            if (cid == null || cid.isEmpty() || title == null) {
                // Without both values neither the download URL nor the file name can be built.
                return null;
            }
            final String videoName = StringUtil.removeSpecialChar(title);

            final String fileName = Constants.BILIBILI_DANMU_FILE_PATH_PATTERN.replace("VIDEO_NAME", videoName);
            if (!FileUtil.isFileExist(fileName)) {
                final String danMuUrl = Constants.BILIBILI_DANMU_URL_PATH_PATTERN.replace("CID", cid);
                if (!HttpUtil.writeRequestEntityIntoFile(danMuUrl, fileName)) {
                    // Do not hand back a path when the download reported failure.
                    return null;
                }
            }
            return fileName;
        } catch (final IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            httpGet.abort();
            // Release the connections held by the client created for this call.
            sHttpClient.getConnectionManager().shutdown();
        }
    }

    /**
     * Returns the text between the first occurrence of {@code startMarker} and
     * the next occurrence of {@code endMarker} after it, or {@code null} when
     * either marker is missing.
     */
    private static String substringBetween(final String text, final String startMarker,
            final String endMarker) {
        if (text == null) {
            return null;
        }
        final int markerIndex = text.indexOf(startMarker);
        if (markerIndex < 0) {
            return null;
        }
        final int startIndex = markerIndex + startMarker.length();
        final int endIndex = text.indexOf(endMarker, startIndex);
        if (endIndex < 0) {
            return null;
        }
        return text.substring(startIndex, endIndex);
    }
}
  • gzipInputStreamToUTF8String
/**
 * Decompresses a gzip stream and decodes it as UTF-8 text.
 *
 * <p>The text is read line by line and every line is terminated with
 * {@code "\r\n"} in the result, regardless of the original line separator.
 * The stream is closed before this method returns, also when reading fails.
 *
 * @param is gzip-compressed input; may be {@code null}
 * @return the decoded text; an empty string when {@code is} is {@code null},
 *         and whatever was read so far (possibly empty) when an
 *         {@link IOException} occurs
 */
public static String gzipInputStreamToUTF8String(InputStream is) {
    StringBuilder sb = new StringBuilder();
    if (is == null) {
        return sb.toString();
    }

    BufferedReader br = null;
    try {
        // Decode explicitly as UTF-8 instead of relying on the platform charset.
        br = new BufferedReader(new InputStreamReader(new GZIPInputStream(is), "utf-8"));

        String line;
        while ((line = br.readLine()) != null) {
            sb.append(line);
            sb.append("\r\n");
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        try {
            if (br != null) {
                // Closing the reader closes the whole stream chain.
                br.close();
            } else {
                // The gzip wrapper could not be built; close the raw stream directly.
                is.close();
            }
        } catch (IOException ignored) {
            // Nothing useful can be done when closing fails; the result is already built.
        }
    }
    return sb.toString();
}
  • removeSpecialChar
/** Characters stripped by {@link #removeSpecialChar(String)}; compiled once and reused. */
private static final Pattern SPECIAL_CHAR_PATTERN = Pattern.compile(
        "[ `~!@#$%^&*()+=|{}':;',\\[\\].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。,、?]");

/**
 * Removes every character matched by the special-character class (spaces,
 * ASCII punctuation and a set of CJK punctuation marks) from {@code str} and
 * trims the result.
 *
 * @param str text to clean; must not be {@code null}
 * @return {@code str} without the matched characters and without leading or
 *         trailing whitespace
 */
public static String removeSpecialChar(String str) {
    return SPECIAL_CHAR_PATTERN.matcher(str).replaceAll("").trim();
}
  • writeRequestEntityIntoFile(公开重载:传入 url 与文件名)
/**
 * Issues a GET request for {@code url} and writes the response body into
 * {@code absFileName}.
 *
 * @param url         address to download
 * @param absFileName path of the file to write
 * @return the result reported by the request-level overload this method delegates to
 */
public static boolean writeRequestEntityIntoFile(final String url, final String absFileName) {
    final HttpClient httpClient = new DefaultHttpClient();
    try {
        return writeRequestEntityIntoFile(new HttpGet(url), httpClient, url, absFileName);
    } finally {
        // The client is created for this call only, so release its connections here.
        httpClient.getConnectionManager().shutdown();
    }
}
  • writeRequestEntityIntoFile(私有重载:执行请求并写入文件)
/**
 * Executes {@code requestBase} with {@code httpClient} and writes the response
 * body into {@code absFileName}. Bodies declared as {@code gzip} or
 * {@code deflate} are decompressed first; bodies without a content encoding
 * (or declared as {@code identity}) are written as received.
 *
 * <p>The request is always aborted before this method returns.
 *
 * @param requestBase request to execute
 * @param httpClient  client used to execute the request
 * @param url         address being requested (not used by this method)
 * @param absFileName path of the file to write
 * @return the result of writing the file, or {@code false} when the status is
 *         not 200, the response has no body, the content encoding is not
 *         supported, or an I/O error occurs
 */
private static boolean writeRequestEntityIntoFile(final HttpRequestBase requestBase,
        final HttpClient httpClient, final String url, final String absFileName) {
    try {
        requestBase.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, WAIT_TIME_OUT);

        final HttpResponse response = httpClient.execute(requestBase);
        if (response.getStatusLine().getStatusCode() != HttpStatus.SC_OK) {
            return false;
        }

        final HttpEntity entity = response.getEntity();
        if (entity == null) {
            System.out.println("entity == null");
            return false;
        }
        System.out.println("contentType : " + entity.getContentType());

        final String encoding = entity.getContentEncoding() == null
                ? null
                : entity.getContentEncoding().getValue();

        if (encoding == null || "identity".equalsIgnoreCase(encoding)) {
            // No compression declared: the body can be stored exactly as received.
            System.out.println("contentEncoding : none");
            return FileUtil.readInputStreamAndWriteToFile(absFileName, entity.getContent());
        }
        if ("gzip".equalsIgnoreCase(encoding)) {
            System.out.println("contentEncoding : gzip");
            return FileUtil.readInputStreamAndWriteToFile(absFileName, new GzipDecompressingEntity(entity).getContent());
        }
        if ("deflate".equalsIgnoreCase(encoding)) {
            System.out.println("contentEncoding : deflate");
            return FileUtil.readInputStreamAndWriteToFile(absFileName, new DeflateDecompressingEntity(entity).getContent());
        }
        System.out.println("unsupported contentEncoding : " + encoding);
    } catch (final IOException e) {
        // ClientProtocolException is an IOException, so one handler covers both.
        e.printStackTrace();
    } finally {
        requestBase.abort();
    }
    return false;
}

后记

篇幅所限,这里没有列出全部代码,但关键代码已经全部列出,其他的一些小方法自己去实现就可以了。