| kshWeb |
ftpSpider Web Crawler |
|
|   |
|
Business Card File Data Entry System |
Korn Shell Books |
Free E-mail
YourName@ UnixGuru.zzn.com YourName@ UnixWizard.zzn.com YourName@ MCSE.zzn.com ... and many others!!! |
This script performs FTP site crawling, or spidering: it goes to a specified URL and crawls through the FTP site, detecting child directories. The output of this program is a list of FTP directories in URL format.
The character based WWW browser Lynx is required to run this application and must be installed first.
Cut and paste the following script into a file called "ftpspider.ksh" on your system. Or click on ftpspider.ksh to download a file containing the script.
#!/bin/ksh
#############################################################
# Program: ftpspider.sh
# Copyright 1999
#
# Description: This program accepts a single URL from the
# command line and then crawls or spiders the ftp
# site to find all child directories associated
# with it. This program returns a list of
# directories as standard output.
#
# Author: Dana French (dfrench@mtxia.com)
#
# (405) 936-2342
#
# Date: 03/22/99
#
#############################################################
# Modifications:
#
# 07/07/98 Version: 1.0
# Original Code
################################################################
syntax()
{
    # Print the usage message on standard output.
    # Takes no arguments. The message always opens with the
    # "No URL was specified" notice, followed by the command syntax
    # and the description of the -v option, framed by blank lines.
    printf '%s\n' \
        "" \
        "No URL was specified" \
        "" \
        "Syntax:" \
        "ftpspider.sh [-?][-v] \"ftp://some.domain.com/some/directory\"" \
        "-v Verbose Mode" \
        ""
}
################################################################
spider()
{
    # Crawl every URL listed in ${WORKLIST} one level deep.
    #
    # Input (globals read):
    #   WORKLIST  file of "URL PARENT" lines to crawl in this pass
    #   CMD_LYNX  command used to dump a page as text
    #   VERBOSE   1 = print crawling/parent/child details,
    #             anything else = print only the URL being crawled
    # Output:
    #   stdout    one line per crawled URL (more when VERBOSE is 1)
    #   TMPLIST   appended with "CHILD-LINE PAGEURL" for each child found
    #   RUNLIST   appended with "PAGEURL PARENT" for each URL crawled
    #
    # PAGEURL, PARENT, SED_URL and VAR_LIST are assigned without being
    # declared local, exactly like the loop variables always were.

    # -r keeps backslashes in the work list intact.
    while read -r PAGEURL PARENT
    do
        if [[ "${VERBOSE:-0}" -eq 1 ]]
        then
            printf '%s\n' "crawling: ${PAGEURL}" " parent: ${PARENT}"
        else
            printf '%s\n' "${PAGEURL}"
        fi

        # The URL is inserted into sed replacement text below.  Escape
        # the characters that are special there ("\", "&" and the "|"
        # delimiter) so a URL such as ftp://host/a&b is copied literally
        # instead of being rewritten or breaking the sed command.
        SED_URL=$(printf '%s\n' "${PAGEURL}" | sed -e 's/[\\&|]/\\&/g')

        # Keep the listing lines mentioning "Directory" (except the
        # "current Directory" banner), take the text after the first
        # "]" and turn it into a URL below the page being crawled.
        # CMD_LYNX is intentionally unquoted so it may hold a command
        # name followed by options.
        VAR_LIST=$(${CMD_LYNX} -dump "${PAGEURL}" \
            | grep -i "Directory" \
            | grep -iv "current Directory" \
            | cut -d"]" -f2 \
            | sed -e "s|^|${SED_URL}/|" \
            | sort -u)

        if [[ -n "${VAR_LIST}" ]]
        then
            if [[ "${VERBOSE:-0}" -eq 1 ]]
            then
                printf '%s\n' "${VAR_LIST}" | sed -e "s/^/ child: /"
            fi
            # Record every child together with the page it was found on.
            printf '%s\n' "${VAR_LIST}" \
                | sed -e "s|\$| ${SED_URL}|" >> "${TMPLIST}"
        fi

        # Remember that this URL has been crawled.
        printf '%s %s\n' "${PAGEURL}" "${PARENT}" >> "${RUNLIST}"
    done < "${WORKLIST}"
}
################################################################
# Main program: parse the command line, then crawl breadth-first.
# Each pass hands the current work list to spider(), collects the
# child URLs it found, drops those already crawled and queues the
# rest for the next pass.  The loop ends when a pass finds nothing new.
CMD_LYNX="lynx"
VERBOSE="0"
case "_${1}" in
    "_-?" ) syntax; exit 0;;
    "_-v" ) VERBOSE="1"; shift;;
esac

PAGEURL="${1}"
if [[ -z "${PAGEURL}" ]]
then
    syntax
    # A missing URL is a usage error, so report failure to the caller.
    exit 1
fi

# SITE is "scheme://host" (the first three "/"-separated fields).
# TOPDIR is the remainder of the URL with one leading and one trailing
# "/" removed.  Parameter expansion strips SITE as a literal prefix
# only; it is never interpreted as a pattern.
SITE=$(printf '%s\n' "${PAGEURL}" | cut -d"/" -f1-3)
TOPDIR=${PAGEURL#"${SITE}"}
TOPDIR=${TOPDIR#/}
TOPDIR=${TOPDIR%/}
if [[ -n "${TOPDIR}" ]]
then
    URLTOP="${SITE}/${TOPDIR}"
else
    URLTOP="${SITE}"
fi

# Keep all scratch files in a private, unpredictably named directory
# and remove it on every exit path, including interrupts.
WORKDIR=$(mktemp -d /tmp/ftpspider.XXXXXX) || {
    echo "ftpspider: unable to create a temporary directory" >&2
    exit 1
}
trap 'rm -rf -- "${WORKDIR}"' EXIT
trap 'exit 1' HUP INT TERM

RUNLIST="${WORKDIR}/runlist.tmp"    # "URL PARENT" lines already crawled
WORKLIST="${WORKDIR}/worklist.tmp"  # lines to crawl in the next pass
TMPLIST="${WORKDIR}/tmplist.tmp"    # children appended by spider()
NEWLIST="${WORKDIR}/newlist.tmp"    # de-duplicated copy of TMPLIST
RUNTMP="${WORKDIR}/runtmp.tmp"      # crawled URLs only (first field)
WORKTMP="${WORKDIR}/worktmp.tmp"    # found URLs that are not yet crawled
NEWTMP="${WORKDIR}/newtmp.tmp"      # found URLs only (first field)

printf '%s\n' "${URLTOP}" > "${WORKLIST}"

# Loop while the work list is non-empty.  Testing the file directly
# avoids a counter named LINES, which shells reserve for the terminal
# height.
while [[ -s "${WORKLIST}" ]]
do
    # Start each pass with an empty child list so the sort below has a
    # file to read even when spider() finds no children.
    : > "${TMPLIST}"
    spider

    sort -u "${TMPLIST}" > "${NEWLIST}"
    sort -u -o "${RUNLIST}" "${RUNLIST}"

    # URLs found in this pass that have not been crawled yet.
    cut -d" " -f1 < "${RUNLIST}" | sort -u > "${RUNTMP}"
    cut -d" " -f1 < "${NEWLIST}" | sort -u > "${NEWTMP}"
    comm -13 "${RUNTMP}" "${NEWTMP}" > "${WORKTMP}"

    # Queue the full NEWLIST lines whose first space-delimited field is
    # exactly one of those URLs.  An exact comparison is required: a
    # regular-expression prefix match would make ftp://h/a also select
    # ftp://h/ab and crawl it again.
    awk -F'[ ]' 'NR == FNR { want[$1] = 1; next } ($1 in want)' \
        "${WORKTMP}" "${NEWLIST}" > "${WORKLIST}"
done
################################################################
|
|
Business Card File Data Entry System |
Korn Shell Books |
Free E-mail
YourName@ UnixGuru.zzn.com YourName@ UnixWizard.zzn.com YourName@ MCSE.zzn.com ... and many others!!! |
|   |
For Information regarding this page, contact Dana French ( dfrench@mtxia.com )