 * Parses the HTML page at the given URL.
 *
 * @param url URL of the page to parse
 * @throws ParserException
 * @throws Exception
 */
. z5 ?0 B- {4 j [ public void processHtml(String url) throws ParserException, Exception { searchedsite.add(url);: v1 a- {- F* k+ o+ v B7 e
count = 0;6 e) S' E# L- J% m9 Q9 j
System.out.println("searching ... :" + url); parser.setURL(url); parser.setEncoding("GBK"); URLConnection uc = parser.getConnection(); uc.connect(); //uc.getLastModified(); NodeIterator nit = parser.elements();: o5 c8 e+ ^% @# P) U7 u' v; o
while (nit.hasMoreNodes()) { Node node = nit.nextNode();) Y. q! B: x0 |/ l
parserNode(node);
: b5 }% q$ H/ f6 W( E. f }# U: ^) x: I$ y
srb.setKeywords(keyword); srb.setUrl(url); srb.setCount_key_words(count); resultlist.add(srb); System.out.println("count keywords is :" + count); System.out.println("----------------------------------------------");" I0 ?; ^2 p6 L3 c# G+ `! ~3 z
}: h: O$ I9 C6 A ~4 j+ b) _, T
/**
0 o5 T- l; z! Q- o- `* { *处理HTML标签
/ l; V0 x. F/ {! ]% c3 ^* Q * @param tag
3 s" _+ Q: \+ o, g+ b * @throws Exception
3 @% t% r& A& e1 F0 E v/ { */
; |1 S; G2 W- g) V9 J( \2 D public void dealTag(Tag tag) throws Exception {! _% M3 |4 P) \, h
NodeList list = tag.getChildren(); if (list != null) { NodeIterator it = list.elements(); while (it.hasMoreNodes()) { Node node = it.nextNode();
* D1 v" `5 h3 ~4 m ^+ V" @ parserNode(node);5 } [$ [; a8 I6 C* Y$ S5 q' E
}2 f0 K9 A% f+ ^" S3 _, Z
}6 R. {/ _9 `+ M( L
}
/**
 * Processes a single HTML node.
 *
 * @param node the node to process
 * @throws Exception
 */
public void parserNode(Node node) throws Exception{
, o+ S6 v% x& Q& k5 t, T if (node instanceof StringNode) {//判断是否是文本结点/ q, J' u8 s5 R) F- L
StringNode sNode = (StringNode) node;
$ {3 J5 G9 T) u- Z {5 K StringFilter sf = new StringFilter(keyword,false); search_key_words = sf.accept(sNode);2 t9 T$ A, y: K# g
if (search_key_words) {! k# `. V; ? h, I
count++;
1 ^$ \! R. Y2 M( z3 S+ k }
& U1 |1 `; n7 E2 Z // System.out.println("text is :"+sNode.getText().trim());
7 a+ Q( x* M+ w* b2 w9 U! ` } else if (node instanceof Tag) {//判断是否是标签库结点
9 O6 g+ U+ N; Q: ~+ G Tag atag = (Tag) node;
6 f. N; ]3 v8 o, X+ X$ A if (atag instanceof TitleTag) {//判断是否是标TITLE结点0 u, A. a( G" v4 z; f
srb.setTitle(atag.getText());
/ A; U) j$ N* x( u' g; n/ z }% t. f1 t7 y2 H0 V4 Z6 Z
if (atag instanceof LinkTag) {//判断是否是标LINK结点
y, j* ~* `% E! U LinkTag linkatag = (LinkTag) atag;4 S0 z- j" ^$ G3 _- f: P" ? |
checkLink(linkatag.getLink(), linklist); // System.out.println("-----------------this is link --------------");3 K; [$ X1 `$ |8 L! }: \1 s
}
! p6 v- g# _9 s dealTag(atag);
% O! p; w) Z8 Z, [9 z* s/ G% X$ H } else if (node instanceof RemarkNode) {//判断是否是注释
2 A5 X& ] z: q! z& T" y* b // System.out.println("this is remark");* J9 e! S) ? E2 w' v5 U: I3 E
! H @2 a6 f9 [+ |6 K+ ~1 n } |