$ bin/nutch mergesegs digidesign.com-segment -dir crawl-20080705112059/segments
$ bin/nutch readseg -dump `ls -d digidesign.com-segment/2008*` dump -nocontent -nofetch -nogenerate \
-noparse -noparsetext
$ mkdir seeds100
$ cat dump/dump \
| grep 'http://digidesign.com/index.cfm' \
| grep 'langid=100' \
| sed -e 's/^.*http:/http:/' -e 's/ .*$//' \
| sort -u \
> seeds100/seeds.txt
$ wc -l seeds100/seeds.txt
291 seeds100/seeds.txt
$ vi conf/crawl-urlfilter.txt
$ grep digidesign conf/crawl-urlfilter.txt
+^http://digidesign.com/index.cfm[?].*langid=100
$ nohup bin/nutch crawl seeds100 -depth 10 &
$ tail -f nohup.out
crawl started in: crawl-20080706115317
rootUrlDir = seeds100
threads = 10
depth = 10
Injector: starting
...
Fetcher: starting
Fetcher: segment: crawl-20080706115317/segments/20080706120142
Fetcher: threads: 10
fetching http://digidesign.com/index.cfm?navid=103&langid=100&
fetching http://digidesign.com/index.cfm?navid=3&langid=100&eid=
fetch of http://digidesign.com/index.cfm?navid=103&langid=100& failed with: Http code=500,\
url=http://digidesign.com/index.cfm?navid=103&langid=100&
fetch of http://digidesign.com/index.cfm?navid=3&langid=100&eid= failed with: Http code=500,\
url=http://digidesign.com/index.cfm?navid=3&langid=100&eid=
Fetcher: done
...
done merging
crawl finished: crawl-20080706115317
$ bin/nutch readseg -list -dir crawl-20080706115317/segments | sort
20080706115322 291 2008-07-06T11:53:32 2008-07-06T12:01:24 291 273
20080706120142 2 2008-07-06T12:01:46 2008-07-06T12:01:48 2 0
20080706120153 2 2008-07-06T12:01:57 2008-07-06T12:01:59 2 0
NAME GENERATED FETCHER START FETCHER END FETCHED PARSED
$ |