Compressing your block blobs before storing them

It is true that Azure Block Blob storage is dirt cheap.   Why pay for storage though if you don't have to?  Unfortunately the current storage SDK does not have compression built in.   If you'd like to leverage GZip to compress your blobs before putting them into your container it is pretty easy.

I created a simple BlobStorage class that accepts the container name in the constructor then uses GZipStream to compress and decompress a byte array before storing it into Azure.   It depends on how compressible your data is but this has the potential to yield massive savings in terms of both storage and network transfer.

The implementation gives you an upload method, which accepts a byte array and a filename, and a download method, which accepts the filename and returns the byte array.

UPDATE:   Based on comments, I added a Boolean to the blob's metadata to indicate whether the blob is compressed.   I also added the original MD5 hash to the metadata and implemented a basic integrity check to ensure the compress/decompress round trip was successful.

 If, for example, you've got a MemoryStream you're working with, you could use this helper to abstract away the compression/decompression for you as follows:

 byte[] mydata = ms.ToArray();BlobStorage storageClient = new BlobStorage("Container"); //Upload data to containerstorageClient.UploadBlob(mydata, "filename", true); //Download data from containerbyte[] mynewdata = storageClient.DownloadBlob("filename", true);

 

Here is the full code for the helper class (also attached):

 using System;
 using System.Configuration;
 using System.IO;
 using System.IO.Compression;
 using System.Security.Cryptography;
 using Microsoft.WindowsAzure.Storage;
 using Microsoft.WindowsAzure.Storage.Blob;
 
 /// <summary>
 /// Helper around a single Azure blob container that optionally GZip-compresses
 /// byte arrays before upload and decompresses them again on download.
 /// </summary>
 /// <remarks>
 /// The storage account is parsed from the "StorageConnectionString" app setting.
 /// Every blob written by <see cref="UploadBlob"/> carries two metadata entries:
 /// "compressed" (whether the stored bytes are GZip-compressed) and "origMD5"
 /// (lowercase hex MD5 of the bytes before compression). <see cref="DownloadBlob"/>
 /// reads both to decide whether to decompress and to verify integrity.
 /// </remarks>
 public class BlobStorage
 {
     // Metadata keys shared by upload and download; kept in one place so they cannot drift apart.
     private const string CompressedKey = "compressed";
     private const string OrigMd5Key = "origMD5";

     private readonly CloudStorageAccount storageAccount = CloudStorageAccount.Parse(ConfigurationManager.AppSettings["StorageConnectionString"]);
     private readonly CloudBlobClient blobClient;
     private readonly CloudBlobContainer container;

     /// <summary>
     /// Creates a helper bound to the named container, creating the container if it does not exist.
     /// </summary>
     /// <param name="containername">Name of the blob container to read from and write to.</param>
     public BlobStorage(string containername)
     {
         // Create the blob client.
         blobClient = storageAccount.CreateCloudBlobClient();
         container = blobClient.GetContainerReference(containername);
         container.CreateIfNotExists();
     }

     /// <summary>
     /// Uploads a byte array as a block blob, optionally GZip-compressing it first.
     /// </summary>
     /// <param name="data">The bytes to store.</param>
     /// <param name="filename">The blob name within the container.</param>
     /// <param name="compressed">When true (the default), the bytes are GZip-compressed before upload.</param>
     /// <returns>The blob reference that was uploaded to.</returns>
     /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
     public CloudBlockBlob UploadBlob(byte[] data, string filename, bool compressed = true)
     {
         if (data == null)
         {
             throw new ArgumentNullException("data");
         }

         // Hash the original bytes, not the compressed ones, so the download
         // side can verify the full compress/decompress round trip.
         string origMD5 = ComputeMd5Hex(data);

         if (compressed)
         {
             data = Compress(data);
         }

         CloudBlockBlob blob = container.GetBlockBlobReference(filename);

         // Indexer rather than Add: Add throws if the key is already present.
         blob.Metadata[CompressedKey] = compressed.ToString();
         blob.Metadata[OrigMd5Key] = origMD5;
         blob.UploadFromByteArray(data, 0, data.Length);
         return blob;
     }

     /// <summary>
     /// Downloads a blob and returns its original bytes, decompressing them when the
     /// blob's "compressed" metadata says they were stored compressed.
     /// </summary>
     /// <remarks>
     /// Blobs that lack the "compressed" entry (for example, blobs not written by
     /// <see cref="UploadBlob"/>) are returned as stored. The MD5 check is skipped
     /// when the "origMD5" entry is absent.
     /// </remarks>
     /// <param name="filename">The blob name within the container.</param>
     /// <returns>The blob's content, decompressed if it was stored compressed.</returns>
     /// <exception cref="InvalidDataException">
     /// The MD5 of the downloaded data does not match the "origMD5" metadata value.
     /// </exception>
     public byte[] DownloadBlob(string filename)
     {
         CloudBlockBlob blob = container.GetBlockBlobReference(filename);

         byte[] data;
         using (MemoryStream ms = new MemoryStream())
         {
             blob.DownloadToStream(ms);
             data = ms.ToArray();
         }

         // Load the blob's metadata so the entries written at upload time can be read below.
         blob.FetchAttributes();

         string compressedFlag;
         bool isCompressed;
         if (blob.Metadata.TryGetValue(CompressedKey, out compressedFlag)
             && bool.TryParse(compressedFlag, out isCompressed)
             && isCompressed)
         {
             data = Decompress(data);
         }

         string origMD5;
         if (blob.Metadata.TryGetValue(OrigMd5Key, out origMD5)
             && !string.Equals(origMD5, ComputeMd5Hex(data), StringComparison.OrdinalIgnoreCase))
         {
             throw new InvalidDataException("MD5 hashes do not match after download");
         }

         return data;
     }

     // GZip-compresses the given bytes and returns the compressed bytes.
     private static byte[] Compress(byte[] data)
     {
         using (MemoryStream comp = new MemoryStream())
         {
             // The GZipStream must be disposed before reading the buffer so the
             // final block and footer are flushed into the MemoryStream.
             using (GZipStream gzip = new GZipStream(comp, CompressionLevel.Optimal))
             {
                 gzip.Write(data, 0, data.Length);
             }
             return comp.ToArray();
         }
     }

     // Decompresses GZip-compressed bytes and returns the original bytes.
     private static byte[] Decompress(byte[] data)
     {
         using (MemoryStream comp = new MemoryStream(data))
         using (MemoryStream decomp = new MemoryStream())
         {
             using (GZipStream gzip = new GZipStream(comp, CompressionMode.Decompress))
             {
                 gzip.CopyTo(decomp);
             }
             return decomp.ToArray();
         }
     }

     // Returns the MD5 hash of the given bytes as a lowercase hex string.
     private static string ComputeMd5Hex(byte[] data)
     {
         byte[] hash;
         using (MD5CryptoServiceProvider md5 = new MD5CryptoServiceProvider())
         {
             hash = md5.ComputeHash(data);
         }

         System.Text.StringBuilder hex = new System.Text.StringBuilder(hash.Length * 2);
         foreach (byte b in hash)
         {
             // "x2" already yields two lowercase hex digits.
             hex.Append(b.ToString("x2"));
         }
         return hex.ToString();
     }
 }
 
 

BlobStorage.cs