#!/bin/ksh
#############################################################
# Program: ftpspider.sh
# Copyright 1999
#
# Description: This program accepts a single URL from the
#     command line and then crawls or spiders the ftp
#     site to find all child directories associated
#     with it.  This program returns a list of
#     directories as standard output.
#
# Usage: ftpspider.sh [-?][-v] "ftp://some.domain.com/some/directory"
#
# Environment:
#     CMD_LYNX  command used to fetch listings (default: lynx)
#     TMPDIR    parent directory for scratch files (default: /tmp)
#
# Exit status:
#     0  crawl finished, or help was requested with -?
#     1  lynx not found, or the scratch directory could not be created
#     2  no URL was given
#
# Author: Dana French (dfrench@aignetplex.com)
#         (405) 936-2342
#
# Date: 03/22/99
#
#############################################################
# Modifications:
#
# 07/07/98 Version: 1.0
#          Original Code
################################################################
# syntax: print the usage message on standard output.
# Callers that report an error redirect it to standard error.
################################################################
syntax()
{
    echo ""
    echo "No URL was specified"
    echo ""
    echo "Syntax: ftpspider.sh [-?][-v] \"ftp://some.domain.com/some/directory\""
    echo ""
    echo "    -v Verbose Mode"
    echo ""
}
################################################################
# spider: crawl every URL listed in ${WORKLIST}.
#
# Reads:   ${WORKLIST}  lines of "url parent" (parent may be empty)
#          ${VERBOSE}, ${CMD_LYNX}
# Appends: ${TMPLIST}   "child-url parent-url" for each directory
#                       entry found in a listing
#          ${RUNLIST}   "url parent" for each URL that was crawled
# Output:  each crawled URL on stdout; in verbose mode the
#          crawling/parent/child detail lines instead.
# Globals PAGEURL, PARENT, VAR_LIST, NAME and CHILD are overwritten.
################################################################
spider()
{
    while read -r PAGEURL PARENT
    do
        if [[ ${VERBOSE} -eq 1 ]]
        then
            printf '%s\n' "crawling: ${PAGEURL}" " parent: ${PARENT}"
        else
            printf '%s\n' "${PAGEURL}"
        fi

        # lynx would otherwise inherit this loop's stdin (the work list),
        # so it is given /dev/null instead.
        # cut -s drops lines that contain no "]"; without it such lines
        # pass through whole and become bogus child URLs.
        # The URL prefix is added with printf rather than sed so that
        # characters such as "&", "|" or "\" in the URL are taken literally.
        VAR_LIST=$("${CMD_LYNX}" -dump "${PAGEURL}" < /dev/null \
            | grep -i "Directory" \
            | grep -iv "current Directory" \
            | cut -s -d"]" -f2 \
            | while IFS= read -r NAME
              do
                  printf '%s\n' "${PAGEURL}/${NAME}"
              done \
            | sort \
            | uniq)

        if [[ -n "${VAR_LIST}" ]]
        then
            if [[ ${VERBOSE} -eq 1 ]]
            then
                printf '%s\n' "${VAR_LIST}" | sed -e "s/^/ child: /"
            fi
            # Record each child together with the page it was found on.
            printf '%s\n' "${VAR_LIST}" \
                | while IFS= read -r CHILD
                  do
                      printf '%s %s\n' "${CHILD}" "${PAGEURL}"
                  done >> "${TMPLIST}"
        fi

        printf '%s %s\n' "${PAGEURL}" "${PARENT}" >> "${RUNLIST}"
    done < "${WORKLIST}"
}
################################################################
# Main
################################################################

CMD_LYNX="${CMD_LYNX:-lynx}"
VERBOSE="0"

case "_${1:-}" in
    "_-?" ) syntax; exit 0;;
    "_-v" ) VERBOSE="1"; shift;;
esac

PAGEURL="${1:-}"
if [[ -z "${PAGEURL}" ]]
then
    syntax >&2
    exit 2
fi

# Fail once, up front, rather than once per crawled URL.
command -v "${CMD_LYNX}" > /dev/null 2>&1 || {
    echo "ftpspider.sh: ${CMD_LYNX}: command not found" >&2
    exit 1
}

# Split the URL into "scheme://host" and the directory below it,
# dropping one leading and one trailing slash from the directory part.
SITE=$(printf '%s\n' "${PAGEURL}" | cut -d"/" -f1-3)
TOPDIR="${PAGEURL#"${SITE}"}"
TOPDIR="${TOPDIR#/}"
TOPDIR="${TOPDIR%/}"

URLTOP="${SITE}/${TOPDIR}"
if [[ -z "${TOPDIR}" ]]
then
    URLTOP="${SITE}"
fi

# Scratch files live in a private, unpredictable directory that is
# removed on every exit path, including HUP/INT/TERM.
WORKDIR=$(mktemp -d "${TMPDIR:-/tmp}/ftpspider.XXXXXX") || {
    echo "ftpspider.sh: cannot create temporary directory" >&2
    exit 1
}
trap 'rm -rf -- "${WORKDIR:?}"' EXIT
trap 'exit 1' HUP INT TERM

RUNLIST="${WORKDIR}/runlist"     # every "url parent" crawled so far
WORKLIST="${WORKDIR}/worklist"   # "url parent" lines to crawl this round
TMPLIST="${WORKDIR}/tmplist"     # children found this round, unsorted
NEWLIST="${WORKDIR}/newlist"     # children found this round, sorted unique

printf '%s\n' "${URLTOP}" > "${WORKLIST}"
: > "${RUNLIST}"

# Crawl level by level until a round produces no unvisited directory.
while [[ -s "${WORKLIST}" ]]
do
    # Start each round with an empty (but existing) child list so that
    # sort has a file to read even when no children are found.
    : > "${TMPLIST}"

    spider

    sort "${TMPLIST}" | uniq > "${NEWLIST}"

    # Next work list: children whose URL (first field) has not been
    # crawled yet.  The comparison is an exact, case-sensitive string
    # match, so "/pub/a" no longer selects "/pub/ab" as a regex prefix
    # match would.  Each URL is queued at most once.
    awk 'FILENAME == ARGV[1] { seen[$1]; next }
         !($1 in seen)       { seen[$1]; print }' \
        "${RUNLIST}" "${NEWLIST}" > "${WORKLIST}"
done

exit 0