“抛砖引玉”
简介
使用 Java 编写的 B 站(bilibili)弹幕文件爬取工具类。目前只支持形如 www.bilibili.com/video/av号/ 的视频页网址,其他形式的网址暂不支持。
正文
- Constants
/**
 * String templates shared by the danmaku download code.
 *
 * <p>Each template carries an upper-case placeholder token that callers substitute with
 * {@link String#replace(CharSequence, CharSequence)} before use.
 */
public class Constants {

    /** Relative path template of a saved danmaku file; the {@code VIDEO_NAME} token is the placeholder. */
    public static final String BILIBILI_DANMU_FILE_PATH_PATTERN = "danmures/bilibili/VIDEO_NAME.xml";

    /** URL template of a remote danmaku XML document; the {@code CID} token is the placeholder. */
    public static final String BILIBILI_DANMU_URL_PATH_PATTERN = "http://comment.bilibili.tv/CID.xml";
}
- getBilibiliDanmuFileByUrl
import java.io.IOException;
import java.io.InputStream;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.CoreConnectionPNames;
/**
 * Utility for downloading the danmaku (bullet-comment) XML file that belongs to a bilibili
 * video page.
 */
public class ClawUtil {

    /** Socket read timeout, in milliseconds, applied to the video page request. */
    private static final int PAGE_SO_TIMEOUT_MILLIS = 20000;

    /**
     * Fetches the video page at {@code url}, extracts the comment id ({@code cid}) and the page
     * title from the returned HTML, and downloads the matching danmaku XML file unless a file
     * with the derived name already exists.
     *
     * <p>The file name is built from the page title with special characters removed; when no
     * usable title is present the cid is used instead.
     *
     * @param url address of the video page
     * @return path of the danmaku file, or {@code null} when the page could not be fetched, the
     *         page contains no cid, or the danmaku download reported failure
     */
    public static String getBilibiliDanmuFileByUrl(final String url) {
        final HttpClient httpClient = new DefaultHttpClient();
        final HttpGet httpGet = new HttpGet(url);
        try {
            // send request
            httpGet.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, PAGE_SO_TIMEOUT_MILLIS);
            final HttpResponse response = httpClient.execute(httpGet);
            final HttpEntity entity = response.getEntity();
            if (entity == null) {
                return null;
            }

            // The page body is treated as gzip-compressed; a body that is not gzip yields an
            // empty string from the helper and is rejected below because no cid is found.
            final String htmlString = StringUtil.gzipInputStreamToUTF8String(entity.getContent());

            // get cid from htmlString
            final String cid = extractBetween(htmlString, "cid=", "&");
            if (cid == null || cid.isEmpty()) {
                return null;
            }

            // get video name from htmlString, falling back to the cid so the file name is never blank
            final String title = extractBetween(htmlString, "<title>", "</title>");
            String videoName = (title == null) ? "" : StringUtil.removeSpecialChar(title);
            if (videoName.isEmpty()) {
                videoName = cid;
            }

            final String fileName = Constants.BILIBILI_DANMU_FILE_PATH_PATTERN.replace("VIDEO_NAME", videoName);
            if (FileUtil.isFileExist(fileName)) {
                return fileName;
            }

            final String danMuUrl = Constants.BILIBILI_DANMU_URL_PATH_PATTERN.replace("CID", cid);
            // Only report the path when the helper says the file was written.
            return HttpUtil.writeRequestEntityIntoFile(danMuUrl, fileName) ? fileName : null;
        } catch (final IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            httpGet.abort();
            // Release the sockets held by this one-shot client.
            httpClient.getConnectionManager().shutdown();
        }
    }

    /**
     * Returns the text located between the first occurrence of {@code startMarker} and the next
     * occurrence of {@code endMarker} after it.
     *
     * @return the enclosed text (possibly empty), or {@code null} when {@code text} is
     *         {@code null} or either marker is missing
     */
    private static String extractBetween(final String text, final String startMarker, final String endMarker) {
        if (text == null) {
            return null;
        }
        final int markerIndex = text.indexOf(startMarker);
        if (markerIndex < 0) {
            return null;
        }
        final int startIndex = markerIndex + startMarker.length();
        final int endIndex = text.indexOf(endMarker, startIndex);
        if (endIndex < 0) {
            return null;
        }
        return text.substring(startIndex, endIndex);
    }
}
- gzipInputStreamToUTF8String
/**
 * Decompresses a gzip stream and decodes its content as UTF-8 text.
 *
 * <p>The text is read line by line and every line is terminated with {@code "\r\n"} in the
 * result, whatever line terminator the input used. I/O failures (including input that is not
 * in gzip format) are printed to standard error and whatever was decoded up to that point is
 * returned. The supplied stream is always closed before this method returns.
 *
 * @param is gzip-compressed input; {@code null} is treated as empty input
 * @return the decoded text, or an empty string when nothing could be read
 */
public static String gzipInputStreamToUTF8String(InputStream is) {
    StringBuilder sb = new StringBuilder();
    if (is == null) {
        return sb.toString();
    }
    BufferedReader br = null;
    try {
        // Closing the outermost reader closes the whole chain down to the caller's stream.
        br = new BufferedReader(new InputStreamReader(new GZIPInputStream(is), "utf-8"));
        String line;
        while ((line = br.readLine()) != null) {
            sb.append(line);
            sb.append("\r\n");
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        try {
            if (br != null) {
                br.close();
            } else {
                // The wrapper chain was never built (e.g. not gzip data); close the raw stream.
                is.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return sb.toString();
}
- removeSpecialChar
/** Character class of the punctuation and whitespace stripped by {@link #removeSpecialChar(String)}; compiled once. */
private static final Pattern SPECIAL_CHAR_PATTERN = Pattern.compile(
        "[ `~!@#$%^&*()+=|{}':;',\\[\\].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。,、?]");

/**
 * Removes every character matched by the special-character class (spaces plus a set of ASCII
 * and CJK punctuation marks) from {@code str} and trims the result.
 *
 * @param str text to clean; {@code null} is treated as empty
 * @return the cleaned text, never {@code null}
 */
public static String removeSpecialChar(String str) {
    if (str == null) {
        return "";
    }
    return SPECIAL_CHAR_PATTERN.matcher(str).replaceAll("").trim();
}
- writeRequestEntityIntoFile
/**
 * Issues a GET request for {@code url} with a freshly created HTTP client and stores the
 * response body in {@code absFileName}.
 *
 * @param url address to download
 * @param absFileName destination file name
 * @return the result reported by the request-based overload
 */
public static boolean writeRequestEntityIntoFile(final String url, final String absFileName) {
    final HttpClient httpClient = new DefaultHttpClient();
    try {
        return writeRequestEntityIntoFile(new HttpGet(url), httpClient, url, absFileName);
    } finally {
        // The client is created per call, so its pooled connections must be released here.
        httpClient.getConnectionManager().shutdown();
    }
}
- writeRequestEntityIntoFile(私有重载,由上面的同名公有方法调用)
/**
 * Executes {@code requestBase} on {@code httpClient} and writes the response body to
 * {@code absFileName}, decompressing it first when the response declares a {@code gzip} or
 * {@code deflate} content encoding. A body without a content encoding (or declared as
 * {@code identity}) is written as received.
 *
 * <p>The request is always aborted before this method returns.
 *
 * @param requestBase request to execute
 * @param httpClient client used to execute the request
 * @param url address being requested; not read by this method
 * @param absFileName destination file name
 * @return the result of {@code FileUtil.readInputStreamAndWriteToFile} when a body was handed
 *         to it; {@code false} for a non-200 status, a missing entity, an unsupported content
 *         encoding, or an I/O error
 */
private static boolean writeRequestEntityIntoFile(final HttpRequestBase requestBase,
        final HttpClient httpClient, final String url, final String absFileName) {
    try {
        requestBase.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, WAIT_TIME_OUT);
        final HttpResponse response = httpClient.execute(requestBase);
        if (response.getStatusLine().getStatusCode() != HttpStatus.SC_OK) {
            return false;
        }
        final HttpEntity entity = response.getEntity();
        if (entity == null) {
            System.out.println("entity == null");
            return false;
        }
        System.out.println("contentType : " + entity.getContentType());

        final String encoding =
                entity.getContentEncoding() == null ? null : entity.getContentEncoding().getValue();
        if (encoding == null || "identity".equalsIgnoreCase(encoding)) {
            // Uncompressed body: store it as received instead of silently dropping it.
            System.out.println("contentEncoding : none");
            return FileUtil.readInputStreamAndWriteToFile(absFileName, entity.getContent());
        }
        if ("gzip".equalsIgnoreCase(encoding)) {
            System.out.println("contentEncoding : gzip");
            return FileUtil.readInputStreamAndWriteToFile(absFileName, new GzipDecompressingEntity(entity).getContent());
        }
        if ("deflate".equalsIgnoreCase(encoding)) {
            System.out.println("contentEncoding : deflate");
            return FileUtil.readInputStreamAndWriteToFile(absFileName, new DeflateDecompressingEntity(entity).getContent());
        }
        // Writing a body in an encoding we cannot decode would produce a corrupt file.
        System.out.println("unsupported contentEncoding : " + encoding);
    } catch (final IOException e) {
        // Also covers ClientProtocolException, which is a subclass of IOException.
        e.printStackTrace();
    } finally {
        requestBase.abort();
    }
    return false;
}
后记
篇幅所限,这里没有列出全部代码,但关键代码已经全部列出。文中引用而未给出的辅助方法和常量(如 FileUtil.isFileExist、FileUtil.readInputStreamAndWriteToFile、WAIT_TIME_OUT)都比较简单,请读者自行实现。