功能需求
将html字符串保存为.mhtml文件
代码实现
- pom.xml依赖
<dependencies>
    <!-- No version given here; presumably managed by a Spring Boot parent/BOM - confirm in the full pom.xml -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <!-- https://mvnrepository.com/artifact/cn.hutool/hutool-all -->
    <dependency>
        <groupId>cn.hutool</groupId>
        <artifactId>hutool-all</artifactId>
        <version>5.8.43</version>
    </dependency>
    <!-- jsoup: parses HTML tags and extracts image/stylesheet resources (required) -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.17.2</version>
    </dependency>
    <!-- Apache utilities: Base64 encoding of downloaded resources and IO stream handling (required) -->
    <!-- source: https://mvnrepository.com/artifact/commons-codec/commons-codec -->
    <dependency>
        <groupId>commons-codec</groupId>
        <artifactId>commons-codec</artifactId>
        <version>1.15</version>
        <scope>compile</scope>
    </dependency>
    <!-- source: https://mvnrepository.com/artifact/commons-io/commons-io -->
    <dependency>
        <groupId>commons-io</groupId>
        <artifactId>commons-io</artifactId>
        <version>2.15.1</version>
        <scope>compile</scope>
    </dependency>
    <!-- source: https://mvnrepository.com/artifact/org.projectlombok/lombok -->
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <version>1.18.38</version>
        <scope>compile</scope>
    </dependency>
</dependencies>
- 获取通过访问url获取html字符串内容工具类
/**
 * HTTP helpers for fetching wiki (Confluence) pages with a caller-supplied cookie.
 */
@Slf4j
public class wikiutils {

    /**
     * Fetches the HTML of a wiki page.
     *
     * @param url    address of the page to fetch
     * @param cookie value sent in the cookie request header
     * @return the response body when the request succeeds; an empty string (never {@code null})
     *         for any other status, after logging the reason
     */
    public static String getconfluencepagehtml(String url, String cookie) {
        // The request helper lives in this class, so call it directly.
        HttpResponse httpResponse = httpgetresponse(url, cookie);
        if (httpResponse.isOk()) {
            String body = httpResponse.body();
            // Callers test the result with isEmpty(); never hand them null.
            return body == null ? "" : body;
        }
        int status = httpResponse.getStatus();
        if (status == 403 || status == 302) {
            // 403 = forbidden, 302 = redirect (presumably to the login page).
            log.error("无效的cookie,无权限访问");
        } else {
            log.error("获取html页面失败");
        }
        return "";
    }

    /**
     * Executes a GET request with the cookie placed in the request headers, so the server
     * does not intercept the request with a login redirect.
     *
     * @param url    address to request
     * @param cookie value sent in the cookie request header
     * @return the executed response; the caller is responsible for checking its status
     */
    public static HttpResponse httpgetresponse(String url, String cookie) {
        Map<String, String> headers = new HashMap<>();
        headers.put("cookie", cookie);
        // "true" overrides any header of the same name already present on the request.
        return HttpRequest.get(url).headerMap(headers, true).execute();
    }
}
- html转换.mhtml核心类
@slf4j
public class html2mhtcompiler {
/**
 * Extracts a short page title from an HTML document.
 *
 * <p>Takes the text of the first {@code <title>} element. When that text contains a
 * {@code '-'} after its first character, only the part before the first dash is kept
 * (presumably to drop a " - space - site" suffix; confirm against real page titles).
 *
 * @param html the HTML source to parse
 * @return the trimmed title, or {@code null} when the document has no {@code <title>} element
 */
public static String parsetittle(String html) {
    Element titleElement = Jsoup.parse(html).selectFirst("title");
    if (titleElement == null) {
        return null;
    }
    String text = titleElement.text();
    int dash = text.indexOf("-");
    // dash == 0 would leave an empty title, so the full text is kept in that case.
    String title = dash > 0 ? text.substring(0, dash) : text;
    return title.trim();
}
/**
 * Downloads every external image, stylesheet and script referenced by the page.
 *
 * @param cookie  cookie sent with each resource request
 * @param html    the HTML source to scan
 * @param baseurl root URL used to resolve relative resource paths
 * @return map from the resource URL exactly as written in the HTML to the downloaded resource
 *         as a Base64 data URL (with MIME prefix)
 */
public static Map<String, String> parsehtmlpage(String cookie, String html, String baseurl) {
    Map<String, String> resourceMap = new HashMap<>();
    Document doc = Jsoup.parse(html);
    // {CSS query, attribute holding the URL, resource type passed to the downloader}
    String[][] targets = {
        {"img[src]", "src", "image"},
        {"link[rel=stylesheet][href]", "href", "css"},
        {"script[src]", "src", "javascript"},
    };
    for (String[] target : targets) {
        for (Element element : doc.select(target[0])) {
            String resourceUrl = element.attr(target[1]);
            // A URL referenced several times only needs to be downloaded once.
            if (!resourceMap.containsKey(resourceUrl)) {
                parseresource(cookie, resourceUrl, target[2], baseurl, resourceMap);
            }
        }
    }
    return resourceMap;
}
/**
 * Strips page chrome that should not end up in the saved document.
 *
 * <p>Removed: the style rules for {@code .ia-splitter-left} and {@code #main} in the head;
 * elements with class {@code acs-side-bar ia-scrollable-section} or {@code ia-splitter-left};
 * elements with id {@code footer}, {@code header}, {@code navigation},
 * {@code comments-section}, {@code page-metadata-banner} and {@code breadcrumb-section};
 * and the inline {@code margin-left: 285px;} declaration on {@code #main}.
 * The {@code likes-and-labels-container} element is intentionally left in place.
 *
 * @param html the HTML source to clean
 * @return the serialised document after the removals; ids or classes that are not present
 *         are simply skipped
 */
public static String removeunwantedelements(String html) {
    Document doc = Jsoup.parse(html);

    // Drop the head style rules that position the side panel and offset the main area.
    removecssselectorfromstyletag(doc, ".ia-splitter-left");
    removecssselectorfromstyletag(doc, "#main");

    // Side bar / left panel. The first alternative matches one element carrying both
    // classes; the second keeps the previous descendant match for compatibility.
    doc.select(".acs-side-bar.ia-scrollable-section, .acs-side-bar .ia-scrollable-section").remove();
    doc.select(".ia-splitter-left").remove();

    // getElementById returns null for a missing id, so each lookup is guarded.
    String[] unwantedIds = {
        "footer", "header", "navigation",
        "comments-section", "page-metadata-banner", "breadcrumb-section",
    };
    for (String id : unwantedIds) {
        Element unwanted = doc.getElementById(id);
        if (unwanted != null) {
            unwanted.remove();
        }
    }

    // Remove only the side-bar offset from #main's inline style and keep everything else.
    Element mainElement = doc.getElementById("main");
    if (mainElement != null && mainElement.hasAttr("style")) {
        String newStyle = mainElement.attr("style").replace("margin-left: 285px;", "").trim();
        if (newStyle.isEmpty()) {
            // Avoid leaving an empty style="" attribute behind.
            mainElement.removeAttr("style");
        } else {
            mainElement.attr("style", newStyle);
        }
    }
    return doc.html();
}
/**
 * Deletes every {@code selector { ... }} rule for the given selector from the
 * {@code <style>} tags inside {@code <head>}, then collapses whitespace in the remaining CSS.
 *
 * <p>Matching is textual: the selector text followed by optional whitespace and one brace
 * block without nested braces. A compound selector that merely ends with the given selector
 * is therefore truncated as well.
 *
 * @param doc      the parsed document, modified in place
 * @param selector the CSS selector whose rule should be removed, e.g. {@code .ia-splitter-left}
 *                 or {@code #main}
 */
private static void removecssselectorfromstyletag(Document doc, String selector) {
    // Quote the selector: '.' and similar CSS punctuation are regex metacharacters and
    // would otherwise match unrelated rules.
    java.util.regex.Pattern rule = java.util.regex.Pattern.compile(
            java.util.regex.Pattern.quote(selector) + "\\s*\\{[^}]*\\}");

    for (Element styleTag : doc.head().select("style")) {
        // Work on the raw data nodes so the CSS is read and written verbatim and is never
        // HTML-escaped on the way back in.
        for (org.jsoup.nodes.DataNode cssNode : styleTag.dataNodes()) {
            String css = cssNode.getWholeData();
            if (css.isEmpty()) {
                continue;
            }
            String cleaned = rule.matcher(css).replaceAll("").trim();
            // Collapse every whitespace run (including line breaks) to a single space.
            cleaned = cleaned.replaceAll("\\s+", " ");
            cssNode.setWholeData(cleaned);
        }
    }
}
/**
 * Downloads one resource and records it in the map; shared by images, stylesheets and scripts.
 *
 * <p>Failures are logged and skipped so that one broken resource never aborts the page.
 *
 * @param cookie       cookie sent with the download request
 * @param resourcesrc  resource URL exactly as written in the HTML; used as the map key
 * @param resourcetype {@code "image"}, {@code "css"} or {@code "javascript"}
 * @param baseurl      root URL used to resolve relative paths
 * @param resourcemap  receives {@code resourcesrc -> data URL} on success; never receives null values
 */
private static void parseresource(String cookie, String resourcesrc, String resourcetype,
        String baseurl, Map<String, String> resourcemap) {
    // Nothing to download for blank sources or resources that are already inlined.
    if (resourcesrc == null || resourcesrc.trim().isEmpty() || resourcesrc.startsWith("data:")) {
        return;
    }
    try {
        String fullResourceUrl = getfullurl(baseurl, resourcesrc);
        String base64Resource = downloadresourcetobase64(fullResourceUrl, resourcetype, cookie);
        if (base64Resource == null) {
            // The downloader returns null for non-200 responses; storing it would make the
            // later URL replacement fail with a NullPointerException.
            log.warn("资源下载失败(非200响应),跳过该资源:{}", resourcesrc);
            return;
        }
        resourcemap.put(resourcesrc, base64Resource);
    } catch (Exception e) {
        log.error("资源解析失败,跳过该资源:{}", resourcesrc, e);
    }
}
/**
 * Turns a resource reference into a full URL.
 *
 * <ul>
 *   <li>{@code http://} / {@code https://} references (any letter case) are returned unchanged;</li>
 *   <li>protocol-relative references ({@code //host/path}) get the {@code https:} scheme;</li>
 *   <li>everything else is appended to {@code baseurl} with exactly one separating slash.</li>
 * </ul>
 *
 * @param baseurl root URL of the site; trailing slashes are ignored
 * @param src     the reference as written in the HTML
 * @return the full URL to request
 */
private static String getfullurl(String baseurl, String src) {
    boolean absolute = src.regionMatches(true, 0, "http://", 0, 7)
            || src.regionMatches(true, 0, "https://", 0, 8);
    if (absolute) {
        return src;
    }
    if (src.startsWith("//")) {
        return "https:" + src;
    }
    // Strip trailing slashes so "http://host/" + "/a.css" does not become "http://host//a.css".
    String root = baseurl == null ? "" : baseurl;
    while (root.endsWith("/")) {
        root = root.substring(0, root.length() - 1);
    }
    return src.startsWith("/") ? root + src : root + "/" + src;
}
/**
 * Downloads a resource (image, stylesheet or script) and returns it as a Base64 data URL.
 *
 * <p>Images are passed through {@code compressimage} before encoding.
 *
 * @param resourceurl  absolute URL of the resource
 * @param resourcetype {@code "image"}, {@code "css"} or {@code "javascript"} (case-insensitive)
 * @param cookie       cookie sent with the request; omitted when {@code null}
 * @return {@code data:<mime>;base64,<payload>}, or {@code null} when the server does not
 *         answer with HTTP 200
 * @throws Exception on malformed URLs, connection failures or read errors
 */
private static String downloadresourcetobase64(String resourceurl, String resourcetype, String cookie)
        throws Exception {
    boolean isImage = "image".equalsIgnoreCase(resourcetype);
    HttpURLConnection conn = (HttpURLConnection) new URL(resourceurl).openConnection();
    try {
        conn.setConnectTimeout(5000);
        conn.setReadTimeout(5000);
        // HttpURLConnection only accepts the upper-case method token.
        conn.setRequestMethod("GET");
        if (cookie != null) {
            conn.setRequestProperty("cookie", cookie);
        }
        // Browser-like headers for servers that reject non-browser clients.
        conn.setRequestProperty("user-agent", "mozilla/5.0 (windows nt 10.0; win64; x64) chrome/120.0.0.0");
        conn.setRequestProperty("connection", "keep-alive");
        conn.setRequestProperty("accept", "*/*");
        if (isImage) {
            conn.setRequestProperty("accept-encoding", "gzip, deflate");
        }

        if (conn.getResponseCode() != HttpURLConnection.HTTP_OK) {
            return null;
        }

        // HttpURLConnection does not decompress by itself; since compression was requested
        // above, the body must be decoded here or the bytes would be unusable.
        String contentEncoding = conn.getContentEncoding();
        byte[] resourceBytes;
        try (InputStream raw = conn.getInputStream();
             InputStream in = "gzip".equalsIgnoreCase(contentEncoding)
                     ? new java.util.zip.GZIPInputStream(raw)
                     : "deflate".equalsIgnoreCase(contentEncoding)
                             ? new java.util.zip.InflaterInputStream(raw)
                             : raw;
             ByteArrayOutputStream out = new ByteArrayOutputStream()) {
            byte[] buffer = new byte[8192];
            int len;
            while ((len = in.read(buffer)) != -1) {
                out.write(buffer, 0, len);
            }
            resourceBytes = out.toByteArray();
        }

        if (isImage && resourceBytes.length > 0) {
            // 0.7 is the compression quality; adjust as needed.
            resourceBytes = compressimage(resourceBytes, 0.7f);
        }

        String mimeType = conn.getContentType();
        if (mimeType == null) {
            // Avoid emitting the literal "data:null;base64,..." when the header is missing.
            mimeType = "application/octet-stream";
        }
        // Standard data-URL form; can replace the original URL directly in the HTML.
        return "data:" + mimeType + ";base64," + Base64.encodeBase64String(resourceBytes);
    } finally {
        // Runs on every path, including non-200 responses and exceptions.
        conn.disconnect();
    }
}
/**
 * Re-encodes an image in its own format to reduce its size, keeping the pixel dimensions.
 *
 * <p>The quality setting is applied to JPEG only; other formats are re-written with the
 * writer's defaults. The original bytes are returned unchanged whenever the input cannot be
 * decoded, no writer exists for its format, encoding fails, or the result is not smaller.
 * NOTE(review): only the first frame is decoded, so a multi-frame image (for example an
 * animated GIF) that shrinks would lose its remaining frames - confirm whether such images
 * occur.
 *
 * @param imagebytes the original encoded image
 * @param quality    compression quality, clamped to 0.1 - 1.0 (higher = sharper and larger)
 * @return the smaller of the re-encoded and the original image bytes; never empty unless the
 *         input was empty
 * @throws Exception if decoding the input fails with an I/O error
 */
private static byte[] compressimage(byte[] imagebytes, float quality) throws Exception {
    float clampedQuality = Math.max(0.1f, Math.min(1.0f, quality));

    BufferedImage image = ImageIO.read(new ByteArrayInputStream(imagebytes));
    if (image == null) {
        return imagebytes; // not a decodable image
    }

    String format = getimageformat(imagebytes);
    if (format == null) {
        format = "jpeg";
    }

    Iterator<javax.imageio.ImageWriter> writers = ImageIO.getImageWritersByFormatName(format);
    if (!writers.hasNext()) {
        return imagebytes; // no encoder for this format
    }
    javax.imageio.ImageWriter writer = writers.next();

    ByteArrayOutputStream encoded = new ByteArrayOutputStream();
    try (MemoryCacheImageOutputStream output = new MemoryCacheImageOutputStream(encoded)) {
        writer.setOutput(output);
        javax.imageio.ImageWriteParam param = writer.getDefaultWriteParam();
        boolean isJpeg = "jpeg".equalsIgnoreCase(format) || "jpg".equalsIgnoreCase(format);
        if (isJpeg && param.canWriteCompressed()) {
            param.setCompressionMode(javax.imageio.ImageWriteParam.MODE_EXPLICIT);
            param.setCompressionQuality(clampedQuality);
        }
        writer.write(null, new javax.imageio.IIOImage(image, null, null), param);
        // Closing the stream (end of this try block) flushes the cache into "encoded".
    } catch (IOException | RuntimeException e) {
        // Compression is best-effort: keep the original image rather than losing it.
        log.warn("图片压缩失败,使用原图", e);
        return imagebytes;
    } finally {
        writer.dispose();
    }

    byte[] compressedBytes = encoded.toByteArray();
    boolean smaller = compressedBytes.length > 0 && compressedBytes.length < imagebytes.length;
    return smaller ? compressedBytes : imagebytes;
}
/**
 * Detects the actual format of an encoded image from its content.
 *
 * @param imagebytes the encoded image
 * @return the format name reported by the first matching image reader, or {@code null} when
 *         no reader recognises the data
 * @throws Exception if the image stream cannot be created or read
 */
private static String getimageformat(byte[] imagebytes) throws Exception {
    // try-with-resources closes the stream on every path, including exceptions.
    try (ImageInputStream iis = ImageIO.createImageInputStream(new ByteArrayInputStream(imagebytes))) {
        if (iis == null) {
            return null;
        }
        Iterator<ImageReader> readers = ImageIO.getImageReaders(iis);
        return readers.hasNext() ? readers.next().getFormatName() : null;
    }
}
/**
 * Replaces resource URLs in the HTML with their embedded (Base64 data URL) form.
 *
 * <p>Each URL is replaced both in its HTML-escaped form ({@code &} written as {@code &amp;},
 * which is how a serialised document carries it inside attributes) and in its raw form.
 * Longer URLs are processed first so that a URL which is a prefix of another one cannot
 * corrupt it.
 *
 * @param html      the HTML source; returned unchanged when {@code null}
 * @param resources map from original resource URL to data URL; entries with a {@code null}
 *                  or empty key, or a {@code null} value, are ignored
 * @return the HTML with every known resource URL replaced
 */
public static String embedresources(String html, Map<String, String> resources) {
    if (html == null || resources == null || resources.isEmpty()) {
        return html;
    }
    java.util.List<String> urls = new java.util.ArrayList<>(resources.keySet());
    // An empty search string would match between every character.
    urls.removeIf(url -> url == null || url.isEmpty());
    urls.sort((a, b) -> Integer.compare(b.length(), a.length()));

    String embeddedHtml = html;
    for (String resourceUrl : urls) {
        String embeddedUrl = resources.get(resourceUrl);
        if (embeddedUrl == null) {
            continue; // download failed; leave the original link in place
        }
        String escapedUrl = resourceUrl.replace("&", "&amp;");
        embeddedHtml = embeddedHtml.replace(escapedUrl, embeddedUrl);
        if (!escapedUrl.equals(resourceUrl)) {
            embeddedHtml = embeddedHtml.replace(resourceUrl, embeddedUrl);
        }
    }
    return embeddedHtml;
}
/**
 * Writes the HTML as a single-part {@code multipart/related} (MHTML) file encoded in UTF-8.
 *
 * <p>The envelope lines are terminated with CRLF, as MIME requires, regardless of the
 * platform line separator. I/O failures are logged and not rethrown.
 *
 * @param html     the HTML content, with all resources already embedded
 * @param filepath path of the file to create or overwrite
 */
public static void saveasmhtml(String html, String filepath) {
    final String crlf = "\r\n";
    try (BufferedWriter writer = new BufferedWriter(
            new OutputStreamWriter(new FileOutputStream(filepath), StandardCharsets.UTF_8))) {
        // MHTML protocol headers.
        writer.write("mime-version: 1.0" + crlf);
        writer.write("content-type: multipart/related; boundary=\"boundary\"" + crlf);
        writer.write(crlf);
        // Opening boundary and headers of the HTML part.
        writer.write("--boundary" + crlf);
        writer.write("content-type: text/html; charset=utf-8" + crlf);
        writer.write(crlf);
        // The HTML body itself.
        writer.write(html);
        writer.write(crlf);
        writer.write(crlf);
        // Closing boundary; without it the file is structurally incomplete.
        writer.write("--boundary--");
        writer.flush();
    } catch (IOException e) {
        log.error("保存mhtml文件失败:{}", filepath, e);
    }
}
逻辑调用:
- 通过url和cookie免密获取html字符串
- 获取html中的图片、css、js转成base64的字符串,因为.mhtml文件中超链接类型的样式无法渲染
- 删除html中不需要的布局和内容
- 使用第 2 步得到的图片、css、js 的 base64 字符串,替换 html 字符串中对应的资源链接
- 保存为.mhtml文件
// Step 1: fetch the page HTML using the cookie instead of an interactive login.
String html = wikiutils.getconfluencepagehtml(link, cookie);
if (html == null || html.isEmpty()) {
    log.error("获取html页面失败");
    return;
}
// Step 2: download the page's images, stylesheets and scripts as Base64 data URLs.
Map<String, String> htmlmap = html2mhtcompiler.parsehtmlpage(cookie, html, properties.baseurl);
// The title becomes the file name: fall back when it is missing and replace characters
// that are not allowed in file names.
String tittle = html2mhtcompiler.parsetittle(html);
String filename = (tittle == null || tittle.trim().isEmpty())
        ? "untitled"
        : tittle.replaceAll("[\\\\/:*?\"<>|]", "_");
// Step 3: remove the layout and content that should not be saved.
String html2 = html2mhtcompiler.removeunwantedelements(html);
// Step 4: replace the resource links with the data URLs collected in step 2.
String parsehtml = html2mhtcompiler.embedresources(html2, htmlmap);
// Step 5: save as an .mhtml file.
html2mhtcompiler.saveasmhtml(parsehtml, currentdir + File.separator + filename + ".mhtml");
到此这篇关于java实现html保存为.mhtml文件的代码逻辑的文章就介绍到这了,更多相关java html保存为.mhtml文件内容请搜索代码网以前的文章或继续浏览下面的相关文章希望大家以后多多支持代码网!
发表评论