llamaindex update + unpacking archives in data
This commit is contained in:
145
unzip_archives.sh
Executable file
145
unzip_archives.sh
Executable file
@@ -0,0 +1,145 @@
|
||||
#!/bin/bash

# Script to recursively find and unpack archives in the data folder.
#
# Valid archives are extracted in place (next to the archive itself), then the
# archive file is moved to data-unpacked-archives.
# Invalid/broken/password-protected archives, archives whose extraction tool
# is not installed, and archives whose extraction fails are moved to
# data-broken-archives.
#
# Supported extensions: zip rar 7z tar.gz tgz tar.xz tar.bz2 gz xz bz2 tar

set -e # Exit on any unhandled error

DATA_DIR="./data"
UNPACKED_DIR="./data-unpacked-archives"
BROKEN_DIR="./data-broken-archives"

#######################################
# Move an archive into a destination directory without overwriting an
# archive of the same name that is already there (archives with identical
# names commonly live in different sub-folders of the data tree).
# Arguments: $1 - archive path, $2 - destination directory
# Returns:   status of mv
#######################################
stash_archive() {
  local archive=$1 dest_dir=$2
  local name=${archive##*/}
  local target="$dest_dir/$name"
  local n=1

  while [[ -e "$target" ]]; do
    target="$dest_dir/$name.$n"
    n=$((n + 1))
  done
  mv -- "$archive" "$target"
}

#######################################
# Check that an archive can be read by the tool for its kind.
# All tool output is discarded; stdin is /dev/null so that a tool asking for
# a password fails instead of blocking or consuming the caller's input.
# Arguments: $1 - kind (zip, rar, 7z, tar.gz, tar.bz2, tar.xz, gz, bz2, xz, tar)
#            $2 - archive path
# Returns:   0 if the archive looks valid, non-zero otherwise
#######################################
archive_is_valid() {
  local kind=$1 archive=$2

  case "$kind" in
    zip) unzip -t "$archive" ;;
    rar) unrar l "$archive" ;;
    7z) 7z l "$archive" ;;
    # Compressed tarballs are checked with tar itself: a valid gzip/bzip2/xz
    # stream alone does not prove that the tar inside is readable.
    tar.gz) tar -tzf "$archive" ;;
    tar.bz2) tar -tjf "$archive" ;;
    tar.xz) tar -tJf "$archive" ;;
    gz) gzip -t "$archive" ;;
    bz2) bzip2 -t "$archive" ;;
    xz) xz -t "$archive" ;;
    tar) tar -tf "$archive" ;;
    *) return 1 ;;
  esac </dev/null >/dev/null 2>&1
}

#######################################
# Decompress a single-file archive (gz/bz2/xz) next to itself, leaving the
# archive untouched so the caller can move it afterwards.
# The data is written to a temporary file first so that a failed
# decompression never leaves a truncated result behind.
# Arguments: $1 - tool (gzip, bzip2 or xz), $2 - archive path
# Returns:   0 on success, non-zero on failure
#######################################
decompress_single() {
  local tool=$1 archive=$2
  local output=${archive%.*}
  local tmp="$output.partial.$$"

  # A file literally named ".gz" has no usable output name.
  [[ -n "${output##*/}" ]] || return 1

  if "$tool" -dc "$archive" >"$tmp"; then
    mv -f -- "$tmp" "$output"
  else
    rm -f -- "$tmp"
    return 1
  fi
}

#######################################
# Extract an archive into a directory, overwriting existing files.
# Arguments: $1 - kind, $2 - archive path, $3 - destination directory
# Returns:   exit status of the extraction tool
#######################################
extract_archive() {
  local kind=$1 archive=$2 dest_dir=$3

  case "$kind" in
    zip) unzip -o "$archive" -d "$dest_dir" ;;
    # -o+ / -y: overwrite without asking, matching "unzip -o" above.
    rar) unrar x -o+ "$archive" "$dest_dir"/ ;;
    7z) 7z x -y "$archive" -o"$dest_dir"/ ;;
    tar.gz) tar -xzf "$archive" -C "$dest_dir"/ ;;
    tar.bz2) tar -xjf "$archive" -C "$dest_dir"/ ;;
    tar.xz) tar -xJf "$archive" -C "$dest_dir"/ ;;
    gz) decompress_single gzip "$archive" ;;
    bz2) decompress_single bzip2 "$archive" ;;
    xz) decompress_single xz "$archive" ;;
    tar) tar -xf "$archive" -C "$dest_dir"/ ;;
    *) return 1 ;;
  esac </dev/null
}

#######################################
# Validate, extract and file away one archive.
# Globals:   UNPACKED_DIR, BROKEN_DIR (read)
# Arguments: $1 - extension the archive was found by, $2 - archive path
# Outputs:   progress messages on stdout
# Returns:   0 unless moving the archive itself fails
#######################################
process_archive() {
  local ext=$1 archive=$2
  local kind=$ext label=$ext tool=""
  local dest_dir

  case "$ext" in
    zip) tool="unzip" ;;
    tgz) kind="tar.gz" ;;
    rar)
      label="RAR"
      tool="unrar"
      ;;
    7z) tool="7z" ;;
    tar) label="TAR" ;;
  esac

  if [[ "$kind" == "zip" ]]; then
    echo "Processing: $archive"
  else
    echo "Processing: $archive (non-zip format)"
  fi

  if [[ -n "$tool" ]] && ! command -v "$tool" >/dev/null 2>&1; then
    stash_archive "$archive" "$BROKEN_DIR"
    echo " $tool not available, moved to $BROKEN_DIR"
    return 0
  fi

  if ! archive_is_valid "$kind" "$archive"; then
    if [[ "$kind" == "zip" ]]; then
      echo " Archive is invalid, password-protected, or in unsupported format"
      stash_archive "$archive" "$BROKEN_DIR"
      echo " Moved to $BROKEN_DIR"
    else
      stash_archive "$archive" "$BROKEN_DIR"
      echo " Could not process $label, moved to $BROKEN_DIR"
    fi
    return 0
  fi

  if [[ "$kind" == "zip" ]]; then
    echo " Archive is valid, extracting..."
  fi

  # Extract the archive in the same directory where it's located.
  dest_dir=$(dirname "$archive")
  if extract_archive "$kind" "$archive" "$dest_dir"; then
    stash_archive "$archive" "$UNPACKED_DIR"
    if [[ "$kind" == "zip" ]]; then
      echo " Successfully extracted and moved to $UNPACKED_DIR"
    else
      echo " Successfully extracted $label and moved to $UNPACKED_DIR"
    fi
  else
    # A failed extraction only affects this archive, not the whole run.
    stash_archive "$archive" "$BROKEN_DIR"
    if [[ "$kind" == "zip" ]]; then
      echo " Extraction failed, moved to $BROKEN_DIR"
    else
      echo " Could not process $label, moved to $BROKEN_DIR"
    fi
  fi
}

#######################################
# Entry point: create the destination directories and process every archive
# under DATA_DIR, one extension at a time.
# Globals:   DATA_DIR, UNPACKED_DIR, BROKEN_DIR (read)
# Outputs:   progress messages on stdout, warnings on stderr
#######################################
main() {
  local ext archive
  local -a found

  # Create destination directories if they don't exist
  mkdir -p "$UNPACKED_DIR" "$BROKEN_DIR"

  if [[ ! -d "$DATA_DIR" ]]; then
    echo "Data directory not found: $DATA_DIR" >&2
  else
    # Multi-part extensions come before their suffixes (tar.gz before gz) so
    # that a tarball is never treated as a plain compressed file.
    for ext in zip rar 7z tar.gz tgz tar.xz tar.bz2 gz xz bz2 tar; do
      # Snapshot the matches first: extracting and moving files while find is
      # still walking the tree gives unpredictable results.
      found=()
      while IFS= read -r -d '' archive; do
        found+=("$archive")
      done < <(find "$DATA_DIR" -type f -name "*.$ext" -print0)

      for archive in "${found[@]}"; do
        # An earlier extraction in this pass may have replaced or removed it.
        [[ -f "$archive" ]] || continue
        process_archive "$ext" "$archive"
      done
    done
  fi

  echo "Processing complete!"
}

main "$@"
|
||||
Reference in New Issue
Block a user