#!/bin/ksh
#############################################################
# Program: kshSpider.sh
# Copyright 1998
#
# Description: This program accepts a single URL from the
#              command line and then crawls or spiders the
#              URL to find all embedded local links in all
#              child pages.  This program returns a list
#              of local links as standard output.
#
# Usage:       kshspider.sh [-?][-v] "http://some.domain.com/index.html"
#
# Requires:    lynx, awk, sort, comm, cut, mktemp
#
# Exit status: 0 on success (and for -?), 1 on a usage or setup error.
#
# Author: Dana French (dfrench@aig.vialink.com)
#         (405) 936-2342
#
# Date: 07/07/98
#
#############################################################
# Modifications:
#
# 07/07/98 Version: 1.0
#          Original Code
################################################################

################################################################
# syntax - print the usage message.
#   Globals:   none
#   Arguments: none
#   Outputs:   usage text on standard output (callers redirect it
#              to standard error when reporting a usage error)
################################################################
syntax()
{
  echo ""
  echo "No URL was specified"
  echo ""
  echo "Syntax: kshspider.sh [-?][-v] \"http://some.domain.com/index.html\"

  -v  Verbose Mode
"
}

################################################################
# spider - crawl every page queued in the work list once.
#   Globals read:    CMD_LYNX, VERBOSE, URLTOP, WORKLIST
#   Globals written: PAGEURL, PARENT, VAR_LIST (scratch variables)
#   Files appended:  TMPLIST - one "child parent" line per local
#                              link found on a crawled page
#                    RUNLIST - one "page parent" line per page
#                              crawled in this pass
#   Outputs:         the crawled URL on standard output (plus
#                    parent and children in verbose mode)
################################################################
spider()
{
  # -r keeps backslashes in URLs intact.
  while read -r PAGEURL PARENT
  do
    if [[ ${VERBOSE} -eq 1 ]]
    then
      echo "crawling: ${PAGEURL}"
      echo "  parent: ${PARENT}"
    else
      echo "${PAGEURL}"
    fi

    # Pull the numbered link references out of the lynx dump and keep
    # only those below URLTOP.  The scope URL is handed to awk through
    # the environment and compared as a literal, case-insensitive
    # prefix, so characters such as "." or "?" in it are not treated
    # as regular-expression operators.
    VAR_LIST=$(
      "${CMD_LYNX}" -dump "${PAGEURL}" \
        | KSHSPIDER_TOP="${URLTOP}" awk '
            BEGIN {
              top = tolower(ENVIRON["KSHSPIDER_TOP"])
              toplen = length(top)
            }
            # Reference lines look like "  12. http://host/path".
            match($0, /[0-9]\. [Hh][Tt][Tt][Pp][Ss]?:/) {
              # Skip the digit, the dot and the blank; the leftmost
              # match keeps URLs that embed a second "http:" whole.
              url = substr($0, RSTART + 3)
              sub(/#.*/, "", url)          # drop the fragment
              low = tolower(url)
              if (substr(low, 1, toplen) == top && low != (top "/"))
                print url
            }
          ' \
        | LC_ALL=C sort -u
    )

    if [[ -n "${VAR_LIST}" ]]
    then
      if [[ ${VERBOSE} -eq 1 ]]
      then
        printf '%s\n' "${VAR_LIST}" | sed -e "s/^/   child: /"
      fi
      # Append the parent with awk rather than a sed replacement so
      # that "&", "|" or "\" in the parent URL are copied verbatim.
      printf '%s\n' "${VAR_LIST}" \
        | KSHSPIDER_PARENT="${PAGEURL}" awk '
            { print $0 " " ENVIRON["KSHSPIDER_PARENT"] }
          ' >> "${TMPLIST}"
    fi

    printf '%s %s\n' "${PAGEURL}" "${PARENT}" >> "${RUNLIST}"
  done < "${WORKLIST}"
}

################################################################
# Main
################################################################
CMD_LYNX="lynx"
VERBOSE="0"

case "_${1}" in
  "_-?" ) syntax; exit 0;;
  "_-v" ) VERBOSE="1"; shift;;
esac

PAGEURL="${1}"
if [[ -z "${PAGEURL}" ]]
then
  # A missing URL is an error: report on stderr with a failing status.
  syntax >&2
  exit 1
fi

if ! type "${CMD_LYNX}" > /dev/null 2>&1
then
  echo "kshSpider: required command '${CMD_LYNX}' was not found" >&2
  exit 1
fi

# SITE is "scheme://host"; TOPDIR is the path below it without the
# leading and trailing slash.  Parameter expansion strips the literal
# prefix, so nothing in the URL is interpreted as a pattern.
SITE=$(printf '%s\n' "${PAGEURL}" | cut -d"/" -f1-3)
TOPDIR=${PAGEURL#"${SITE}"}
TOPDIR=${TOPDIR#/}
TOPDIR=${TOPDIR%/}

URLTOP="${SITE}/${TOPDIR}"
if [[ -z "${TOPDIR}" ]]
then
  URLTOP="${SITE}"
fi

# Keep all scratch files in a private directory with an unpredictable
# name, and remove it on every exit path, including interruption.
WORKDIR=$(mktemp -d "${TMPDIR:-/tmp}/kshspider.XXXXXX")
if [[ -z "${WORKDIR}" || ! -d "${WORKDIR}" ]]
then
  echo "kshSpider: cannot create a temporary directory" >&2
  exit 1
fi
trap 'rm -rf "${WORKDIR}"' EXIT
trap 'exit 1' HUP INT TERM

RUNLIST="${WORKDIR}/runlist"     # "page parent" for every page crawled
WORKLIST="${WORKDIR}/worklist"   # "page parent" still to be crawled
TMPLIST="${WORKDIR}/tmplist"     # raw "child parent" from the last pass
NEWLIST="${WORKDIR}/newlist"     # de-duplicated TMPLIST
RUNTMP="${WORKDIR}/runtmp"       # unique URLs already crawled
WORKTMP="${WORKDIR}/worktmp"     # unique URLs not yet crawled
NEWTMP="${WORKDIR}/newtmp"       # unique URLs found in the last pass

: > "${RUNLIST}"
printf '%s\n' "${URLTOP}" > "${WORKLIST}"

# PENDING rather than LINES: the shell itself maintains LINES as the
# terminal height and may overwrite it.
PENDING=$(wc -l < "${WORKLIST}")

while [[ ${PENDING} -gt 0 ]]
do
  # Start each pass with an empty (but existing) link file so the sort
  # below has input even when no page yielded a local link.
  : > "${TMPLIST}"

  spider

  LC_ALL=C sort -u "${TMPLIST}" > "${NEWLIST}"

  # De-duplicate the run list in place, using TMPLIST as scratch.
  cp "${RUNLIST}" "${TMPLIST}"
  LC_ALL=C sort -u "${TMPLIST}" > "${RUNLIST}"

  cut -d" " -f1 "${RUNLIST}" | LC_ALL=C sort -u > "${RUNTMP}"
  cut -d" " -f1 "${NEWLIST}" | LC_ALL=C sort -u > "${NEWTMP}"

  # URLs seen in this pass that have never been crawled.
  LC_ALL=C comm -13 "${RUNTMP}" "${NEWTMP}" > "${WORKTMP}"

  # Queue the "child parent" lines whose child is exactly one of those
  # URLs.  An exact comparison avoids re-queueing URLs that merely
  # share a prefix with a new link.
  awk '
    FILENAME == ARGV[1] { want[$1] = 1; next }
    ($1 in want)
  ' "${WORKTMP}" "${NEWLIST}" > "${WORKLIST}"

  PENDING=$(wc -l < "${WORKLIST}")
done

exit 0