| kshWeb |
kshSpider Web Crawler |
|
|   |
|
Business Card File Data Entry System |
Korn Shell Books |
Free E-mail
YourName@ UnixGuru.zzn.com YourName@ UnixWizard.zzn.com YourName@ MCSE.zzn.com ... and many others!!! |
This script performs WWW crawling or spidering, which means that it will go to a specified URL and crawl through the web site, detecting other pages and links. The output of this program is a list of URLs.
The character based WWW browser Lynx is required to run this application and must be installed first.
Cut and paste the following script into a file called "kshspider.ksh" on your system, or click on kshspider.ksh to download a file containing the script.
#!/bin/ksh
#############################################################
# Program: kshSpider.sh
# Copyright 1998
#
# Description: This program accepts a single URL from the
# command line and then crawls or spiders the
# URL to find all imbedded local links in all
# child pages. This program returns a list
# of local links as standard output.
#
# Author: Dana French (dfrench@aig.vialink.com)
#
# (405) 936-2342
#
# Date: 07/07/98
#
#############################################################
# Modifications:
#
# 07/07/98 Version: 1.0
# Original Code
################################################################
syntax()
{
# Print the usage message on standard output.  The text is emitted
# verbatim from a quoted here-document, so nothing inside it is
# expanded by the shell.  The message is framed by one blank line
# before "No URL was specified" and one blank line after the option
# list.
cat <<'USAGE_TEXT'

No URL was specified

Syntax:
kshspider.sh [-?][-v] "http://some.domain.com/index.html"
-v Verbose Mode

USAGE_TEXT
}
################################################################
spider()
{
# Crawl every page queued in ${WORKLIST} and record what was found.
#
# Globals read:
#   WORKLIST  file of "URL PARENT" lines to crawl in this pass
#   CMD_LYNX  command used to run Lynx (left unquoted on purpose so it
#             may carry extra options)
#   URLTOP    only links beginning with this prefix are kept; a link
#             that is exactly "${URLTOP}/" is dropped.  NOTE: grep
#             interprets the value as a basic regular expression.
#   VERBOSE   1 = print crawling/parent/child detail, otherwise print
#             just the URL being crawled
# Files appended to:
#   TMPLIST   one "CHILD PAGEURL" line per link found on a page
#   RUNLIST   one "PAGEURL PARENT" line per page crawled
# Output: progress lines on standard output as described for VERBOSE.
# Side effect: PAGEURL, PARENT, VAR_LIST and CHILDURL are set globally.
  while read -r PAGEURL PARENT
  do
    if [[ ${VERBOSE} -eq 1 ]]
    then
      printf '%s\n' "crawling: ${PAGEURL}"
      printf '%s\n' " parent: ${PARENT}"
    else
      printf '%s\n' "${PAGEURL}"
    fi

    # Lynx lists a page's links as "  N. URL" lines.  Select those
    # lines (http or https), strip the "N. " prefix and any #fragment,
    # keep the links under URLTOP and remove duplicates.  Stripping the
    # numeric prefix, rather than everything up to the last "http:",
    # keeps links whose query string embeds another URL intact.
    # stdin is taken from /dev/null so Lynx can never consume the
    # worklist this loop is reading.
    VAR_LIST=$(${CMD_LYNX} -dump "${PAGEURL}" < /dev/null \
      | grep -i '^ *[0-9][0-9]*\. *https\{0,1\}:' \
      | sed -e 's/^ *[0-9][0-9]*\. *//' -e 's/#.*//' \
      | grep -i "^${URLTOP}" \
      | grep -iv "^${URLTOP}/$" \
      | sort -u)

    if [[ "_${VAR_LIST}" != "_" ]]
    then
      if [[ ${VERBOSE} -eq 1 ]]
      then
        printf '%s\n' "${VAR_LIST}" | sed -e "s/^/ child: /"
      fi
      # Pair every child with its parent page.  This is done with
      # printf rather than a sed substitution because "&" and "|" in
      # the parent URL would be treated specially by sed and corrupt
      # the recorded value.
      printf '%s\n' "${VAR_LIST}" \
        | while IFS= read -r CHILDURL
          do
            printf '%s %s\n' "${CHILDURL}" "${PAGEURL}"
          done >> "${TMPLIST}"
    fi

    printf '%s %s\n' "${PAGEURL}" "${PARENT}" >> "${RUNLIST}"
  done < "${WORKLIST}"
}
################################################################
# Main program.
#
# Usage: kshspider.sh [-?][-v] URL
#
# Starting from URL, repeatedly calls spider() on the current worklist
# until a pass discovers no page that has not already been crawled.
# Work files live in a private directory that is removed when the
# script exits, including when it is interrupted.
#
# Exit status: 0 after a crawl or after "-?", 1 when no URL was given
# or the work directory could not be created.
CMD_LYNX="lynx"
VERBOSE="0"
case "_${1}" in
  "_-?" ) syntax; exit 0;;
  "_-v" ) VERBOSE="1"; shift;;
esac

PAGEURL="${1}"
if [[ "_${PAGEURL}" = "_" ]]
then
  # A missing URL is a usage error: report it on stderr and fail.
  syntax >&2
  exit 1
fi

# SITE is "scheme://host"; TOPDIR is the path below it without its
# leading and trailing slash.  Parameter expansion is used so the site
# name is matched literally and only as a prefix.
SITE=$(printf '%s\n' "${PAGEURL}" | cut -d"/" -f1-3)
TOPDIR=${PAGEURL#"${SITE}"}
TOPDIR=${TOPDIR#/}
TOPDIR=${TOPDIR%/}
URLTOP="${SITE}/${TOPDIR}"
if [[ "_${TOPDIR}" = "_" ]]
then
  URLTOP="${SITE}"
fi

# Private work directory.  Prefer mktemp; if it is unavailable fall
# back to mkdir, which fails rather than reusing an existing path.
WORKDIR=$(mktemp -d "${TMPDIR:-/tmp}/kshspider.XXXXXX" 2>/dev/null)
if [[ "_${WORKDIR}" = "_" ]]
then
  WORKDIR="${TMPDIR:-/tmp}/kshspider.${$}"
  ( umask 077 && mkdir "${WORKDIR}" ) || {
    echo "kshspider: unable to create work directory ${WORKDIR}" >&2
    exit 1
  }
fi
trap 'rm -rf "${WORKDIR}"' EXIT
trap 'exit 1' HUP INT TERM

RUNLIST="${WORKDIR}/runlist.tmp"     # "URL PARENT" for every page crawled
WORKLIST="${WORKDIR}/worklist.tmp"   # "URL PARENT" pages for the next pass
TMPLIST="${WORKDIR}/tmplist.tmp"     # "CHILD PARENT" links found this pass
NEWLIST="${WORKDIR}/newlist.tmp"     # TMPLIST sorted, duplicates removed
RUNTMP="${WORKDIR}/runtmp.tmp"       # URLs already crawled
WORKTMP="${WORKDIR}/worktmp.tmp"     # URLs found this pass, not yet crawled
NEWTMP="${WORKDIR}/newtmp.tmp"       # URLs found this pass

printf '%s\n' "${URLTOP}" > "${WORKLIST}"

# Keep crawling while the worklist is non-empty.
while [[ -s "${WORKLIST}" ]]
do
  # Start each pass with an empty link list so the sort below always
  # has a file to read, even when no page yielded any link.
  : > "${TMPLIST}"
  spider

  sort -u "${TMPLIST}" > "${NEWLIST}"
  sort -u -o "${RUNLIST}" "${RUNLIST}"

  # URLs seen this pass that have never been crawled.
  cut -d" " -f1 < "${RUNLIST}" | sort -u > "${RUNTMP}"
  cut -d" " -f1 < "${NEWLIST}" | sort -u > "${NEWTMP}"
  comm -13 "${RUNTMP}" "${NEWTMP}" > "${WORKTMP}"

  # Build the next worklist: one "URL PARENT" line per uncrawled URL.
  # The URL is compared exactly (not as a regular-expression prefix)
  # and only its first parent is kept, so no page is queued twice.
  if [[ -s "${WORKTMP}" ]]
  then
    awk 'NR == FNR { wanted[$1] = 1; next }
         ($1 in wanted) && !queued[$1]++' \
      "${WORKTMP}" "${NEWLIST}" > "${WORKLIST}"
  else
    : > "${WORKLIST}"
  fi
done
# The work directory and everything in it is removed by the EXIT trap.
################################################################
|
|
Business Card File Data Entry System |
Korn Shell Books |
Free E-mail
YourName@ UnixGuru.zzn.com YourName@ UnixWizard.zzn.com YourName@ MCSE.zzn.com ... and many others!!! |
|   |
For Information regarding this page, contact Dana French ( dfrench@mtxia.com )