DelHtmlUtil.java 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140
  1. package com.shawn.util;
  2. import lombok.extern.slf4j.Slf4j;
  3. import org.elasticsearch.search.suggest.completion.RegexOptions;
  4. import java.util.regex.Matcher;
  5. import java.util.regex.Pattern;
  6. @Slf4j
  /**
   * Regex-based helpers for turning an HTML fragment into plain text and for
   * pulling the first image URL out of it.
   *
   * <p>These helpers are pattern based, not a real HTML parser: they are meant
   * for producing summaries/previews from reasonably well-formed markup and
   * must not be relied on as a sanitizer for untrusted HTML.
   *
   * <p>All patterns are compiled once; {@link Pattern} instances are immutable
   * and safe to share between threads.
   */
  public class DelHtmlUtil {

    /** A whole {@code <pre>} element, content included. */
    private static final Pattern PRE_PATTERN =
        Pattern.compile("<pre[^>]*?>[\\s\\S]*?<\\/pre>", Pattern.CASE_INSENSITIVE);

    /** A whole {@code <script>} element, content included. */
    private static final Pattern SCRIPT_PATTERN =
        Pattern.compile("<script[^>]*?>[\\s\\S]*?<\\/script>", Pattern.CASE_INSENSITIVE);

    /** A whole {@code <style>} element, content included. */
    private static final Pattern STYLE_PATTERN =
        Pattern.compile("<style[^>]*?>[\\s\\S]*?<\\/style>", Pattern.CASE_INSENSITIVE);

    /** Any single tag ({@code <...>}); the text between tags is not part of the match. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]+>");

    /**
     * An {@code <img>} tag up to and including its {@code src} attribute value.
     *
     * <ul>
     *   <li>{@code [^>]*?} keeps the search inside a single tag (and, unlike
     *       {@code .}, also matches line breaks, so multi-line tags work);</li>
     *   <li>{@code \ssrc} requires whitespace right before the attribute name, so
     *       {@code data-src} and similar are not mistaken for {@code src};</li>
     *   <li>the value is captured in group 1 (double-quoted), group 2
     *       (single-quoted) or group 3 (unquoted).</li>
     * </ul>
     */
    private static final Pattern IMG_SRC_PATTERN = Pattern.compile(
        "<img\\b[^>]*?\\ssrc\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'>]+))",
        Pattern.CASE_INSENSITIVE);

    /**
     * Strips HTML markup from the given string.
     *
     * <p>{@code <script>}, {@code <style>} and {@code <pre>} elements are removed
     * together with their content; every other tag is removed but the text
     * around it is kept. Whitespace inside the remaining text is preserved; only
     * leading and trailing whitespace is trimmed.
     *
     * @param htmlStr the HTML fragment, may be {@code null}
     * @return the remaining plain text, or an empty string when {@code htmlStr}
     *     is {@code null}
     */
    public static String delHTMLTag(String htmlStr) {
      if (htmlStr == null) {
        return "";
      }
      // Elements whose *content* must disappear go first; once the generic tag
      // pattern has run, their bodies would be indistinguishable from normal text.
      String text = SCRIPT_PATTERN.matcher(htmlStr).replaceAll("");
      text = STYLE_PATTERN.matcher(text).replaceAll("");
      text = PRE_PATTERN.matcher(text).replaceAll("");
      text = TAG_PATTERN.matcher(text).replaceAll("");
      return text.trim();
    }

    /**
     * Returns the {@code src} attribute value of the first {@code <img>} tag
     * that has one.
     *
     * <p>Double-quoted, single-quoted and unquoted attribute values are
     * supported; surrounding quotes are never part of the result.
     *
     * @param htmlStr the HTML fragment, may be {@code null}
     * @return the attribute value (possibly empty), or an empty string when
     *     {@code htmlStr} is {@code null} or contains no {@code <img>} with a
     *     {@code src} attribute
     */
    public static String getFirstImgSrc(String htmlStr) {
      if (htmlStr == null) {
        return "";
      }
      Matcher imgMatcher = IMG_SRC_PATTERN.matcher(htmlStr);
      if (!imgMatcher.find()) {
        return "";
      }
      // Exactly one of the three alternatives took part in the match.
      for (int group = 1; group <= imgMatcher.groupCount(); group++) {
        String value = imgMatcher.group(group);
        if (value != null) {
          return value;
        }
      }
      return "";
    }

    /**
     * Reduces an HTML fragment to compact text: markup is removed via
     * {@link #delHTMLTag(String)} and then every space character is dropped.
     *
     * @param htmlStr the HTML fragment, may be {@code null}
     * @return the text without markup and without spaces, or an empty string
     *     when {@code htmlStr} is {@code null}
     */
    public static String getTextFromHtml(String htmlStr) {
      String text = delHTMLTag(htmlStr);
      // NOTE(review): this removes every space, including the ones between words.
      // If the intent was to drop HTML "&nbsp;" entities instead, confirm with the
      // callers and change the literal; behaviour is kept as-is here.
      return text.replace(" ", "");
    }
  }