ImageNet 1K data set
https://github.com/tensorflow/models/blob/master/research/inception/inception/data/download_and_preprocess_imagenet.sh
which is linked from
https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Classification/RN50v1.5
which is linked from
https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
The script also needs several other files, which are found in
https://github.com/tensorflow/models/tree/master/research/inception/inception/data
// My working directory:
$ pwd
/home/soh/Downloads

// I put those files in:
/home/soh/Downloads/download_and_preprocess_imagenet.sh
/home/soh/Downloads/download_and_preprocess_imagenet.sh.runfiles/inception/inception/build_imagenet_data.py
/home/soh/Downloads/download_and_preprocess_imagenet.sh.runfiles/inception/inception/data/download_imagenet.sh
/home/soh/Downloads/download_and_preprocess_imagenet.sh.runfiles/inception/inception/data/imagenet_lsvrc_2015_synsets.txt
/home/soh/Downloads/download_and_preprocess_imagenet.sh.runfiles/inception/inception/data/preprocess_imagenet_validation_data.py
/home/soh/Downloads/download_and_preprocess_imagenet.sh.runfiles/inception/inception/data/imagenet_2012_validation_synset_labels.txt
/home/soh/Downloads/download_and_preprocess_imagenet.sh.runfiles/inception/inception/data/imagenet_metadata.txt
/home/soh/Downloads/download_and_preprocess_imagenet.sh.runfiles/inception/inception/data/process_bounding_boxes.py
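// One way to fetch the files into that layout (a sketch only; the raw.githubusercontent.com URLs are guessed from the repo layout above and may need adjusting if the repo has moved things):
cd /home/soh/Downloads
BASE=https://raw.githubusercontent.com/tensorflow/models/master/research/inception/inception
RUNFILES=download_and_preprocess_imagenet.sh.runfiles/inception/inception
mkdir -p "${RUNFILES}/data"
wget "${BASE}/data/download_and_preprocess_imagenet.sh"
# build_imagenet_data.py lives under data/ in the repo, but the wrapper expects it one level up
wget -O "${RUNFILES}/build_imagenet_data.py" "${BASE}/data/build_imagenet_data.py"
for f in download_imagenet.sh imagenet_lsvrc_2015_synsets.txt preprocess_imagenet_validation_data.py \
         imagenet_2012_validation_synset_labels.txt imagenet_metadata.txt process_bounding_boxes.py; do
  wget -P "${RUNFILES}/data" "${BASE}/data/${f}"
done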

// In the .py files, change the shebang from python to python3 (no plain python is available).
// chmod +x all the .sh and .py files.
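// A minimal sketch of both steps, assuming the shebang lines simply end in "python" (adjust the sed if they differ):
cd /home/soh/Downloads
chmod +x download_and_preprocess_imagenet.sh
find download_and_preprocess_imagenet.sh.runfiles -name '*.sh' -o -name '*.py' | xargs chmod +x
# rewrite "...python" shebangs to "...python3" in the Python scripts
find download_and_preprocess_imagenet.sh.runfiles -name '*.py' -exec sed -i '1s/python$/python3/' {} +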
// Then modify /home/soh/Downloads/download_and_preprocess_imagenet.sh as follows:
------------------------- file start
#!/bin/bash
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Script to download and preprocess ImageNet Challenge 2012
# training and validation data set.
#
# The final output of this script are sharded TFRecord files containing
# serialized Example protocol buffers. See build_imagenet_data.py for
# details of how the Example protocol buffers contain the ImageNet data.
#
# The final output of this script appears as such:
#
# data_dir/train-00000-of-01024
# data_dir/train-00001-of-01024
# ...
# data_dir/train-01023-of-01024
#
# and
#
# data_dir/validation-00000-of-00128
# data_dir/validation-00001-of-00128
# ...
# data_dir/validation-00127-of-00128
#
# Note that this script may take several hours to run to completion. The
# conversion of the ImageNet data to TFRecords alone takes 2-3 hours depending
# on the speed of your machine. Please be patient.
#
# **IMPORTANT**
# To download the raw images, the user must create an account with image-net.org
# and generate a username and access_key. The latter two are required for
# downloading the raw images.
#
# usage:
# ./download_and_preprocess_imagenet.sh [data-dir]
set -e

if [ -z "$1" ]; then
echo "Usage: download_and_preprocess_imagenet.sh [data dir]"
exit
fi

# Create the output and temporary directories.
DATA_DIR="${1%/}"
SCRATCH_DIR="${DATA_DIR}/raw-data/"
mkdir -p "${DATA_DIR}"
mkdir -p "${SCRATCH_DIR}"
WORK_DIR="$0.runfiles/inception/inception"

# Download the ImageNet data.
LABELS_FILE="${WORK_DIR}/data/imagenet_lsvrc_2015_synsets.txt"
DOWNLOAD_SCRIPT="${WORK_DIR}/data/download_imagenet.sh"
"${DOWNLOAD_SCRIPT}" "${SCRATCH_DIR}" "${LABELS_FILE}"

# Note the locations of the train and validation data.
TRAIN_DIRECTORY="${SCRATCH_DIR}train/"
VALIDATION_DIRECTORY="${SCRATCH_DIR}validation/"

# Preprocess the validation data by moving the images into the appropriate
# sub-directory based on the label (synset) of the image.
echo "Organizing the validation data into sub-directories."
PREPROCESS_VAL_SCRIPT="${WORK_DIR}/data/preprocess_imagenet_validation_data.py"
VAL_LABELS_FILE="${WORK_DIR}/data/imagenet_2012_validation_synset_labels.txt"

"${PREPROCESS_VAL_SCRIPT}" "${VALIDATION_DIRECTORY}" "${VAL_LABELS_FILE}"

# Convert the XML files for bounding box annotations into a single CSV.
echo "Extracting bounding box information from XML."
BOUNDING_BOX_SCRIPT="${WORK_DIR}/data/process_bounding_boxes.py"
BOUNDING_BOX_FILE="${SCRATCH_DIR}/imagenet_2012_bounding_boxes.csv"
BOUNDING_BOX_DIR="${SCRATCH_DIR}bounding_boxes/"

"${BOUNDING_BOX_SCRIPT}" "${BOUNDING_BOX_DIR}" "${LABELS_FILE}" \
  | sort > "${BOUNDING_BOX_FILE}"
echo "Finished downloading and preprocessing the ImageNet data."

# Build the TFRecords version of the ImageNet data.
#BUILD_SCRIPT="${WORK_DIR}/build_imagenet_data"
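# (modified) run build_imagenet_data.py directly instead of the bazel-built binary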
BUILD_SCRIPT="${WORK_DIR}/build_imagenet_data.py"
OUTPUT_DIRECTORY="${DATA_DIR}"
IMAGENET_METADATA_FILE="${WORK_DIR}/data/imagenet_metadata.txt"

"${BUILD_SCRIPT}" \
  --train_directory="${TRAIN_DIRECTORY}" \
  --validation_directory="${VALIDATION_DIRECTORY}" \
  --output_directory="${OUTPUT_DIRECTORY}" \
  --imagenet_metadata_file="${IMAGENET_METADATA_FILE}" \
  --labels_file="${LABELS_FILE}" \
  --bounding_box_file="${BOUNDING_BOX_FILE}"
------------------------- file end
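// Optional sanity check of the edited script and the runfiles layout it expects (WORK_DIR is derived from $0, so the runfiles tree must sit next to the script):
$ bash -n /home/soh/Downloads/download_and_preprocess_imagenet.sh
$ ls /home/soh/Downloads/download_and_preprocess_imagenet.sh.runfiles/inception/inception/data/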

// Also modify /home/soh/Downloads/download_and_preprocess_imagenet.sh.runfiles/inception/inception/data/download_imagenet.sh as follows:
------------------------- file start
#!/bin/bash
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Script to download ImageNet Challenge 2012 training and validation data set.
#
# Downloads and decompresses raw images and bounding boxes.
#
# **IMPORTANT**
# To download the raw images, the user must create an account with image-net.org
# and generate a username and access_key. The latter two are required for
# downloading the raw images.
#
# usage:
# ./download_imagenet.sh [dir name] [synsets file]
set -e

#if [ "x$IMAGENET_ACCESS_KEY" == x -o "x$IMAGENET_USERNAME" == x ]; then
# cat <<END
#In order to download the imagenet data, you have to create an account with
#image-net.org. This will get you a username and an access key. You can set the
#IMAGENET_USERNAME and IMAGENET_ACCESS_KEY environment variables, or you can
#enter the credentials here.
#END
# read -p "Username: " IMAGENET_USERNAME
# read -p "Access key: " IMAGENET_ACCESS_KEY
#fi

OUTDIR="${1:-./imagenet-data}"
SYNSETS_FILE="${2:-./synsets.txt}"
#FILES_DIR="/home/soh/Downloads"
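# (modified) /ws is the docker bind mount of /home/soh/Downloads (see the docker run command at the end)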
FILES_DIR="/ws"

echo "Saving downloaded files to $OUTDIR"
mkdir -p "${OUTDIR}"
INITIAL_DIR=$(pwd)
BBOX_DIR="${OUTDIR}bounding_boxes"
mkdir -p "${BBOX_DIR}"
cd "${OUTDIR}"

# Download and process all of the ImageNet bounding boxes.
BASE_URL="http://www.image-net.org/challenges/LSVRC/2012/nonpub"

# See here for details: http://www.image-net.org/download-bboxes
BOUNDING_BOX_ANNOTATIONS="${BASE_URL}/ILSVRC2012_bbox_train_v2.tar.gz"
#BBOX_TAR_BALL="${BBOX_DIR}/annotations.tar.gz"
BBOX_TAR_BALL="${FILES_DIR}/ILSVRC2012_bbox_train_v2.tar.gz"
#echo "Saving bounding box annotations to $BBOX_TAR_BALL"
#echo "Downloading bounding box annotations."
#wget "${BOUNDING_BOX_ANNOTATIONS}" -O "${BBOX_TAR_BALL}" || BASE_URL_CHANGE=1
#if [ $BASE_URL_CHANGE ]; then
# BASE_URL="http://www.image-net.org/challenges/LSVRC/2012/nnoupb"
# BOUNDING_BOX_ANNOTATIONS="${BASE_URL}/ILSVRC2012_bbox_train_v2.tar.gz"
# wget "${BOUNDING_BOX_ANNOTATIONS}" -O "${BBOX_TAR_BALL}"
#fi
echo "Uncompressing bounding box annotations ..."
tar xzf "${BBOX_TAR_BALL}" -C "${BBOX_DIR}"

LABELS_ANNOTATED="${BBOX_DIR}/*"
NUM_XML=$(ls -1 ${LABELS_ANNOTATED} | wc -l)
echo "Identified ${NUM_XML} bounding box annotations."

# Download and uncompress all images from the ImageNet 2012 validation dataset.
VALIDATION_TARBALL="ILSVRC2012_img_val.tar"
OUTPUT_PATH="${OUTDIR}validation/"
mkdir -p "${OUTPUT_PATH}"
cd "${OUTDIR}/.."
#echo "Downloading ${VALIDATION_TARBALL} to ${OUTPUT_PATH}."
#wget -nd -c "${BASE_URL}/${VALIDATION_TARBALL}"
echo "Extracting ${VALIDATION_TARBALL} to ${OUTPUT_PATH}."
#tar xf "${VALIDATION_TARBALL}" -C "${OUTPUT_PATH}"
tar xf "${FILES_DIR}/ILSVRC2012_img_val.tar" -C "${OUTPUT_PATH}"

# Download all images from the ImageNet 2012 train dataset.
TRAIN_TARBALL="ILSVRC2012_img_train.tar"
OUTPUT_PATH="${OUTDIR}train/"
mkdir -p "${OUTPUT_PATH}"
cd "${OUTDIR}/.."
#echo "Downloading ${TRAIN_TARBALL} to ${OUTPUT_PATH}."
#wget -nd -c "${BASE_URL}/${TRAIN_TARBALL}"

# Un-compress the individual tar-files within the train tar-file.
echo "Uncompressing individual train tar-balls in the training data."

while read SYNSET; do
echo "Processing: ${SYNSET}"

# Create a directory and delete anything there.
mkdir -p "${OUTPUT_PATH}/${SYNSET}"
rm -rf "${OUTPUT_PATH}/${SYNSET}/*"

# Uncompress into the directory.
# tar xf "${TRAIN_TARBALL}" "${SYNSET}.tar"
tar xf "${FILES_DIR}/ILSVRC2012_img_train.tar" "${SYNSET}.tar"
tar xf "${SYNSET}.tar" -C "${OUTPUT_PATH}/${SYNSET}/"
rm -f "${SYNSET}.tar"

echo "Finished processing: ${SYNSET}"
done < "${SYNSETS_FILE}"
#done < "${INITIAL_DIR}/${SYNSETS_FILE}"
------------------------- file end

// wget no longer works for these downloads, so get the tar / tar.gz files via torrent instead.
// Search http://academictorrents.com for the file names below and download the torrents.
// The downloaded files are:
/home/soh/Downloads/ILSVRC2012_bbox_train_v2.tar.gz (about 20MB)
/home/soh/Downloads/ILSVRC2012_img_train.tar (about 148GB)
/home/soh/Downloads/ILSVRC2012_img_val.tar (about 6.7GB)
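// A quick integrity check before spending hours on preprocessing; the counts are the standard ILSVRC2012 ones (listing the 148 GB train tar itself takes a while):
$ tar tf ILSVRC2012_img_val.tar | wc -l      # expect 50000 validation JPEGs
$ tar tf ILSVRC2012_img_train.tar | wc -l    # expect 1000 per-synset tar files
$ tar tzf ILSVRC2012_bbox_train_v2.tar.gz > /dev/null && echo "bbox archive OK"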

// Now the required files are ready.
// Call the .sh with an absolute path and give an absolute path for the target data directory.
// The preprocessing scripts use numpy and tensorflow => run them inside a docker container.

$ pwd
/home/soh/Downloads

$ docker run --gpus all -it -v $PWD:/ws -w /ws tensorflow/tensorflow:1.14.0-gpu-py3 bash

// inside the docker container:
/ws $ /ws/download_and_preprocess_imagenet.sh /ws/imagenet1k
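
// When the script finishes, the TFRecord shards should match the counts in the script header (1024 train, 128 validation):
/ws $ ls /ws/imagenet1k | grep -c '^train-'         # expect 1024
/ws $ ls /ws/imagenet1k | grep -c '^validation-'    # expect 128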
