</p> if (!isSearched(searchedsite, url)) {; @; G- ?5 S6 w/ b" q4 l
if (isRobotAllowed(new URL(url)))//搜检该链接是否被许可搜索
. |& p% A+ U! M% h processHtml(url);/ F/ B% e& W7 o" ^3 j. j1 W
else
, I) B/ |3 I, I+ C+ r' ^$ i- L System.out.println("this page is disallowed to search");" A( \ t. i9 c" b( U' r% k( [4 @
}4 N! `* P7 T8 ] F C
} catch (Exception ex) {: q8 W8 u$ l& M
}
2 v+ H+ g- Z N, p @2 v, c- K) C- e queue.remove();" @5 Z) j, [5 A
}
/ T* J$ B5 a" f; l }
- L* @& U' W3 S8 t9 R+ j) h /**
6 ?) f' t1 F) F4 n *解析HTML6 \2 k/ A. c- v9 F A* k
* @param url& ^! p N7 p. p- M
* @throws ParserException
5 P, A) c3 Z# I e. A$ ? * @throws Exception
6 k* V% w7 }* f6 m */
8 s8 F* D: q H1 R, m! y public void processHtml(String url) throws ParserException, Exception { searchedsite.add(url);
& J2 n3 p' m3 P count = 0;/ P% r% E* t2 w- ~7 A
System.out.println("searching ... :" + url); parser.setURL(url); parser.setEncoding("GBK"); URLConnection uc = parser.getConnection(); uc.connect(); //uc.getLastModified(); NodeIterator nit = parser.elements();1 `/ h( m0 ~, i# `7 T, {7 J" S
while (nit.hasMoreNodes()) { Node node = nit.nextNode();! M- S' @6 l8 [/ q4 L3 I
parserNode(node);
2 }# x2 u0 V6 I2 ~ l7 G) l5 u( o }
. Y" L8 s1 G. V0 h, P srb.setKeywords(keyword); srb.setUrl(url); srb.setCount_key_words(count); resultlist.add(srb); System.out.println("count keywords is :" + count); System.out.println("----------------------------------------------");
N# a+ }- ]; D+ p }% [3 H c. t6 G. n2 f( s6 |
/**
0 l5 j( O" j1 e9 Q+ S6 [& ?* Q" f *措置HTML标签
. B* M# u5 I D0 O2 _# e5 p( Z. ? * @param tag
1 L+ }: `" U7 a& V- c * @throws Exception! K, h. z; i, @8 S3 h E
*/; d% Q/ W6 o* P6 s4 \6 o
public void dealTag(Tag tag) throws Exception {
, S1 Y/ |) G, ^' k" T" q NodeList list = tag.getChildren(); if (list != null) { NodeIterator it = list.elements(); while (it.hasMoreNodes()) { Node node = it.nextNode();4 E; n Q# |5 D, D' ~
parserNode(node);/ Y6 g* S0 A& m4 i" Y
}3 ~! H7 {, @* l; N1 c
}7 K9 f* t0 X5 x* `3 k
}5 p5 D9 }3 k/ h$ A5 }( r
/**
 * Processes a single HTML node.
 *
 * @param node the node to process
 * @throws Exception if processing the node fails
 */
public void parserNode(Node node) throws Exception{% i5 X- T* Z# u1 c/ C0 x
if (node instanceof StringNode) {//判定是否是文本结点
' Y% u" q5 m" n. I StringNode sNode = (StringNode) node;. p& c, q) b7 {: y# ]' O
StringFilter sf = new StringFilter(keyword,false); search_key_words = sf.accept(sNode);
9 n/ ?! P* G @3 P2 S% h if (search_key_words) {
6 I4 r# ~4 R2 } count++;
. f- n, r( d0 E, z6 U
$ n3 O5 t3 f# b } |