I use code to transfer hundreds of large files to a remote server, and sometimes, after all transfers are done, the size of a remote file does not match the size of the local file. I wrote the following code to compare the sizes of the local and remote files; if the sizes don’t match, it deletes the remote file and re-uploads a fresh copy.

Note that some of this code is specific to what I am doing for myself. Happy to answer any questions.

import logging
import os
import re
import shlex
import sys
from string import ascii_uppercase

import paramiko
from scp import SCPClient

# Configure the root logger: INFO and above goes to both output.log and the console.
log_handlers = [logging.FileHandler("output.log"), logging.StreamHandler()]
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s:%(levelname)s:%(message)s',
    handlers=log_handlers,
)

# Create the SSH Connection
def createSSHClient(server, user, pem_file):
    """Connect to ``server`` over SSH and return the connected client.

    Args:
        server: Host name or IP address of the remote machine.
        user: Login name on the remote machine.
        pem_file: Path to the private key file used for authentication.

    Returns:
        A connected ``paramiko.SSHClient``.
    """
    ssh = paramiko.SSHClient()
    # NOTE: AutoAddPolicy accepts host keys the client has not seen before
    # instead of rejecting them, so the server's identity is not verified.
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    ssh.connect(hostname=server, username=user, key_filename=pem_file)
    return ssh

def progress4(filename, size, sent, peername):
    """Write a one-line transfer progress indicator to stdout.

    Intended as the ``progress4`` callback of ``SCPClient``.

    Args:
        filename: Name of the file being transferred (``str`` or ``bytes``).
        size: Total size of the file in bytes.
        sent: Number of bytes transferred so far.
        peername: ``(host, port)`` pair of the remote end.
    """
    # The name may arrive as bytes; decode it so the output does not show b'...'.
    if isinstance(filename, bytes):
        filename = filename.decode('utf-8', errors='replace')
    # An empty file has nothing left to send; avoid dividing by zero.
    percent = float(sent) / float(size) * 100 if size else 100.0
    sys.stdout.write("(%s:%s) %s's progress: %.2f%%   \r" % (peername[0], peername[1], filename, percent))
    # The line ends in '\r', not '\n', so a line-buffered stdout would not show it without a flush.
    sys.stdout.flush()

# This regex is specific to the filenames I am looking for.
BENCHMARK_FOLDER_RE = re.compile(r'a\d\d\d')


def find_data_dir():
    """Return the local data directory, or None if no drive contains it.

    The local path is known but the drive letter is not (it is mapped
    differently on every start), so every letter A-Z is probed. The first
    drive that contains the directory wins.
    """
    for drive in ascii_uppercase:
        candidate = os.path.join(drive + ':\\', 'Shared drives', 'Data')
        if os.path.exists(candidate):
            return candidate
    return None


def parse_ls_listing(lines):
    """Map file name -> size in bytes for the regular files in ``ls -l`` output.

    Args:
        lines: Iterable of output lines from ``ls -l``.

    Returns:
        A dict of ``{filename: size}``. The ``total N`` header, directories,
        symlinks and any line that does not look like a file entry are skipped.
    """
    sizes = {}
    for line in lines:
        # Split on runs of whitespace (the columns are padded) and stop after
        # eight splits so a file name containing spaces stays in one piece.
        fields = line.split(None, 8)
        if len(fields) < 9 or not fields[0].startswith('-'):
            continue
        try:
            size = int(fields[4])
        except ValueError:
            continue
        sizes[fields[8].rstrip('\r\n')] = size
    return sizes


def local_path_for(data_dir, filename):
    """Return the local path of ``filename``, or None if its name has no benchmark id.

    The local file lives in a sub-folder of ``data_dir`` named after the
    upper-cased benchmark id (e.g. ``a123`` -> ``A123``) found in the name.
    """
    match = BENCHMARK_FOLDER_RE.search(filename)
    if match is None:
        return None
    return os.path.join(data_dir, match.group(0).upper(), filename)


def remove_remote_file(ssh_client, remote_path):
    """Delete ``remote_path`` on the server; return True when ``rm`` succeeded."""
    stdin, stdout, stderr = ssh_client.exec_command('rm -f -- ' + shlex.quote(remote_path))
    # Wait for rm to finish so the following upload cannot race the delete.
    return stdout.channel.recv_exit_status() == 0


def main():
    """Compare remote and local file sizes and re-upload every file that differs."""
    user = 'username'
    server = 'IP address'
    pem_file = r'\path\to\pem\file.pem'
    remote_dir = '/remote/path/'

    data_dir = find_data_dir()
    if data_dir is None:
        logging.error("Could not find the local data directory on any drive")
        raise SystemExit(1)

    logging.info(f"Local directory: {data_dir}")
    logging.info(f"Remote directory: {remote_dir}")

    logging.info("Creating SSH Connection")
    ssh_client = createSSHClient(server, user, pem_file)
    try:
        scp = SCPClient(ssh_client.get_transport(), progress4=progress4)
        try:
            # This gets the filesizes of the files in the remote dir
            logging.info("Retrieving files and filesizes from remote directory")
            stdin, stdout, stderr = ssh_client.exec_command('ls -l -- ' + shlex.quote(remote_dir))
            lines = stdout.readlines()
            if stdout.channel.recv_exit_status() != 0:
                logging.error(f"Could not list {remote_dir}: {stderr.read().decode(errors='replace').strip()}")
                raise SystemExit(1)

            # Create a dictionary with the filename as key and filesize as value
            logging.info("Creating dictionary")
            remote_files_dict = parse_ls_listing(lines)
            if not remote_files_dict:
                logging.info(f"There are no files in {remote_dir}")
                return

            for name, remote_size in remote_files_dict.items():
                local_file = local_path_for(data_dir, name)
                if local_file is None:
                    logging.warning(f"Skipping {name}: no benchmark folder found in its name")
                    continue
                try:
                    local_file_size = os.stat(local_file).st_size
                except OSError as err:
                    logging.warning(f"Skipping {name}: cannot read local file ({err})")
                    continue

                sizes_match = remote_size == local_file_size
                print(name, remote_size, local_file_size, sizes_match)
                if sizes_match:
                    continue

                # Any mismatch (remote smaller OR larger) means the copy is bad.
                logging.warning("Deleting file from remote path")
                remote_file = remote_dir.rstrip('/') + '/' + name
                if not remove_remote_file(ssh_client, remote_file):
                    # scp overwrites an existing file, so the upload is still attempted.
                    logging.warning(f"Could not delete {remote_file}")
                logging.info("Copying local file to remote path")
                # Upload into remote_dir; the default target is the login directory.
                scp.put(local_file, remote_path=remote_dir)
        finally:
            scp.close()
    finally:
        ssh_client.close()


if __name__ == '__main__':
    main()

By Tony

Leave a Reply

Your email address will not be published. Required fields are marked *

This site uses Akismet to reduce spam. Learn how your comment data is processed.