web.csv is a CSV file containing URLs, their titles, and the image URLs found in the crawled HTML files
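
Each row has four ";"-separated fields: the quoted page URL, the quoted page title, a quoted list of "link_url|image_url" pairs found on that page, and a unix timestamp. A sample row (hypothetical values, matching the format webcsv_parse writes):

"https://www.example.org/index.html";"Example page";"https://www.example.org/gallery.html|https://www.example.org/img/photo.png ";1698259200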

search

webcsv | grep -i "<search_query>"

search example with multiple words: "download debian"

webcsv | grep -i "download" | grep -i "debian"
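
Because the columns are ";"-separated, a match can be trimmed down to just the URL and title with awk (a sketch using the same separator webcsv_parse writes):

webcsv | grep -i "download" | grep -i "debian" | awk -F";" '{ print $1, $2 }'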

image search
(webcsv_simg extracts matching "link_url|image_url" entries from the piped web.csv data; webcsv_imgui renders them as an HTML gallery and opens it in Firefox)

webcsv | webcsv_simg "<search_query>" | head -n 100 | webcsv_imgui
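
To preview the matches without opening the gallery, stop the pipeline before webcsv_imgui; each output line is one "link_url|image_url" entry (hypothetical query, assuming web.csv has already been built):

webcsv | webcsv_simg "debian" | head -n 20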

crawl

webcsv_crawl <start_url>
# additional arguments are passed to wget2, e.g.:
webcsv_crawl --header 'Cookie: your_session_cookie' "<start_url>" --max-threads 3
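
The crawled pages land under ~/tmp, grouped by protocol and host; webcsv_parse relies on this layout. A sketch of the expected paths with a hypothetical start url:

~/tmp/https/www.debian.org/index.html
~/tmp/https/www.debian.org/releases/index.html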

update web.csv

webcsv_parse
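
A typical end-to-end run, from crawl to search (hypothetical start url):

webcsv_crawl "https://www.debian.org/"
webcsv_parse
webcsv | grep -i "debian"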

install

  1. save the source code below to a file named webcsv
  2. source webcsv in your shell (a minimal sketch follows below)
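
A minimal sketch of step 2, assuming the script was saved as ~/bin/webcsv:

echo 'source ~/bin/webcsv' >> ~/.bashrc
source ~/.bashrc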
    

source code

# crawl recursively (depth 3) into ~/tmp, keeping only html/plain-text pages;
# any extra arguments are forwarded to wget2 via webcsv_argstr
function webcsv_crawl() {
  mkdir -p ~/tmp
  eval 'wget2 -P ~/tmp -H -r --filter-mime-type "text/html,text/plain" --protocol-directories '$(webcsv_argstr "$@")' -l 3'
}

function webcsv_argstr() {
  # rebuild the argument list as one string for the eval in webcsv_crawl:
  # flags (--*) pass through as-is, other arguments are re-quoted
  for i in $(seq 1 20); do
    val=$(eval 'echo -n '$(echo -n '$'$i'" "'))

    # stop at the first empty argument slot
    if [ ${#val} -lt 2 ]; then
      break
    fi

    if [ "${val:0:2}" == "--" ]; then
      echo -n "$val"
    else
      eval 'echo -n \"'$(echo '$'$i'\"" "')
    fi
  done
}
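
# Illustration (comments only, not executed): given the arguments
#   --header 'Cookie: abc' https://example.org
# webcsv_argstr emits
#   --header "Cookie: abc" "https://example.org"
# so the eval in webcsv_crawl keeps multi-word values together as single words.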

function webcsv_parse() {
  sd=$PWD
  cd ~/tmp
  # make sure an index exists before merging into it (first run)
  touch web.csv
  cp web.csv web-tmp.csv
  files=$(find . -type f | grep -vP "./web(-tmp|).csv")
  i=0
  len=$(echo -e "$files" | wc -l)
  echo -e "$files" | while read f; do
    let i++
    echo "processing $i/$len $f"
    # wget2 saved the file as ./<protocol>/<host>/<path>; rebuild the url from it
    protocol=$(echo "$f" | awk -F "/" '{ print $2 }')
    proto_offset=$(( ${#protocol} + 3 ))
    url=$(echo "$protocol://${f:$proto_offset}" | webcsv_encodeuri)
    images=$(cat "$f" | webcsv_parseimages)
    #title=$(cat "$f" | htmlq title -t | perl -pe "s/\n/ /g" | perl -pe "s/\"/”/g")
    title=$(cat "$f" | { input=$(cat); m=$(echo "$input" | grep -o '<title>.*</title>'); echo "${m:7:-8}"; })

    date=$(date +%s)
    # append one row: "url";"title";"link|img link|img ...";timestamp
    echo '"'"$url"'";"'"$title"'";"'"$images"'";'"$date"'' >> web-tmp.csv
  done
  # merge with the previous index and drop duplicate rows
  cat web-tmp.csv | sort | uniq > web.csv
  rm web-tmp.csv
  cd "$sd"
}

function webcsv() {
  cat ~/tmp/web.csv
}

function webcsv_encodeuri() {
  # percent-encode stdin; the last 3 characters (presumably the encoded
  # trailing newline) are stripped
  s=$(cat | urlencode -c '|"')
  echo "${s::-3}"
}

function webcsv_simg() {
  # keep rows matching the query and extract unique "link_url|image_url" entries
  grep -i "$1" | grep -oP "http[^ ]+\.(jpg|jpeg|png|webp|svg)" | sort | uniq
  #grep -i "$1" | grep -P "http[^ ]+\.(jpg|jpeg|png|webp|svg)" | sort | uniq | awk -F"|" '!_[$1]++ ' | awk -F"|" '!_[$2]++ '
  #| awk -F";" '{ print $1,$3 }'
}

function webcsv_imgui() {
  # render the piped "link_url|image_url" entries as a paginated html gallery
  echo -e "$(cat <<EOF
    <!DOCTYPE html>
    <html>
    <body>
    <style>
    img {
      max-width: calc(33vw - 1em);
      max-height: 50vh;
    }
    body > div {
      position: relative;
      display: inline-block; 
    } 
    p {
      text-align: center; 
      position: fixed; 
      bottom: 0; 
      width: 100%; 
      margin: 0;
    } 
    p > a {
      display: inline-block; 
      margin: .5em; 
      background: #ddd; 
      padding: .3em;
    }
    div > a {
      position: absolute;
      border: 1em solid red;
      border-width: .3em 0 0 0;
      width: 50%;
      height: 100%;
      opacity: .1;
    }
    div > a:hover {
      opacity: 1;
      background: rgba(0, 0, 0, .1)
    }
    div > a:nth-child(2) {
      right: 0;
    }
    </style>
    <script>
    // simple pagination: show 'count' gallery entries starting at 'index'
    let index = 0
    let count = 20

    function render_gallery(){
      links = [...document.querySelectorAll("body > div")];
      links.forEach(link => link.style.display = "inline-block");
      [...links.slice(0, index), ...links.slice(index + count, 500)].forEach(link => {
        link.style.display = "none"
      });
      window.scrollTo(0, 0)
    }

    onload = function(){
      let nav = document.createElement("p")
      let prev = document.createElement("a")
      let next = document.createElement("a")
      prev.href = next.href = "#"
      prev.innerText = "< prev"
      next.innerText = "next >"

      prev.onclick = function(e){
        e.preventDefault()
        index = Math.max(0, index - 10)
        render_gallery()
      }
      next.onclick = function(e){
        e.preventDefault()
        index = Math.min(links.length - count, index + 10)
        render_gallery()
      }

      nav.append(prev)
      nav.append(next)
      document.body.append(nav)

      render_gallery()
    }
    </script>
EOF
  )" > ~/tmp/searchimg.html

  searchres=$(cat)
  echo $(echo -e "$searchres" | while read entry; do
    # each entry is "link_url|image_url"; imgsource is the page in web.csv
    # the pair was found on
    parts=$(echo "$entry" | perl -pe "s/\|/\n/")
    link=$(echo "$parts" | head -n 1)
    img=$(echo "$parts" | tail -n 1)
    imgsource=$(webcsv | grep -F "$link" | awk -F";" '{ print $1 }' | head -n 1)
    echo '<div><a href="'"$link"'"></a><a class="link" href="'"$imgsource"'"></a><img src="'"$img"'"/></div>'
  done) >> ~/tmp/searchimg.html

  echo '</body></html>' >> ~/tmp/searchimg.html

  firefox ~/tmp/searchimg.html
}

function webcsv_parseimages() {
  # collect <a> elements that wrap an <img>; the extracted url list then
  # alternates between the link href and the image src (both assumed absolute)
  links=$(cat | htmlq "a" | grep img | grep -oP "http[^ \"]+")
  [ -z "$links" ] && return
  c=$(echo "$links" | wc -l)
  for i in $(seq 1 2 $c); do
    link=$(echo -e "$links" | head -n $i | tail -n 1 | webcsv_encodeuri)
    img=$(echo -e "$links" | head -n $(( $i + 1 )) | tail -n 1 | webcsv_encodeuri)
    echo -n "$link|$img "
  done
}
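
# Illustration of webcsv_parseimages output (comments only, not executed):
# one "link_url|image_url" pair per linked image, space separated, e.g.
#   https://example.org/gallery.html|https://example.org/img/photo.jpg https://example.org/about.html|https://example.org/img/logo.png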

Pub: 25 Oct 2023 19:18 UTC

Edit: 27 Oct 2023 14:17 UTC
