Generate image dataset from one folder, label each image one by one, for machine learning via tensorflow

This is an update to my previous post:
http://lengerrong.blogspot.com/2017/04/create-your-own-dataset-for-machine.html

It is hard to collect enough images for machine-learning training.
So I updated the script from that post so that newly found images can be appended to your existing dataset.
All you need to do is copy your images into a folder and run the Python script.

Of course, you have to label each image while the Python script is running.
The Python script below has just two labels: 0 means cat, 1 means dog.
You can add more labels by modifying the label-handling code (highlighted in blue in the original post).

The running screenshot :

For my machine-learning training example, you can refer to:
https://github.com/lengerrong/tensorflow/tree/master/tensorflow/examples/clixsence


The dataset format is the same as the CIFAR-10 binary format.
The first byte is the label of the first image, which is a number in the range 0-9. The next 3072 bytes are the values of the pixels of the image. The first 1024 bytes are the red channel values, the next 1024 the green, and the final 1024 the blue. The values are stored in row-major order, so the first 32 bytes are the red channel values of the first row of the image.


import os
import subprocess
import sys

import Image
import numpy as np

IMAGE_GEN_OK = True
IMAGE_GEN_FAIL = False

def image_gen(filepath, label):
  """Encode one image file as a CIFAR-10 style binary record.

  The record is a flat uint8 array: one label byte, followed by all red
  values, then all green values, then all blue values, each colour plane
  in row-major order. The image is NOT resized here, so the record is
  1 + 3 * width * height bytes long; feed 32x32 images to obtain the
  3073-byte records described for the dataset format.

  Args:
    filepath: path of the image file to read.
    label: class label, as an int or a numeric string; it must fit in
      a single unsigned byte (0..255).

  Returns:
    An object with a `status` attribute equal to IMAGE_GEN_OK or
    IMAGE_GEN_FAIL. On success it also carries `imagebytes`, the uint8
    record described above. On failure the error is printed and
    `imagebytes` is not set. This function never raises for ordinary
    errors (bad label, unreadable or unsupported image).
  """
  class ImageGen(object):
    """Plain attribute holder for the generation result."""
  result = ImageGen()
  result.status = IMAGE_GEN_FAIL
  try:
    # Validate the label first: callers pass the raw string typed by the
    # user, and anything outside 0..255 cannot be stored in one byte.
    label_byte = int(label)
    if not 0 <= label_byte <= 255:
      raise ValueError('label %r does not fit in one byte' % (label,))

    # Open the file ourselves so the handle is closed deterministically,
    # and copy the pixels out before leaving the `with` block.
    with open(filepath, 'rb') as image_file:
      im = Image.open(image_file)
      # Normalise grayscale / palette / CMYK / RGBA input to exactly
      # three 8-bit channels so the channel indexing below is always valid.
      pixels = np.array(im.convert('RGB'), dtype=np.uint8)

    # (row, col, channel) -> (channel, row, col), then flatten: this yields
    # the full red plane, then green, then blue, each in row-major order.
    planes = pixels.transpose(2, 0, 1).reshape(-1)

    record = np.empty(planes.size + 1, dtype=np.uint8)
    record[0] = label_byte
    record[1:] = planes

    result.imagebytes = record
    result.status = IMAGE_GEN_OK
  except Exception as e:
    # Deliberate best-effort: report the problem and let the caller decide
    # what to do based on `status`.
    print(e)
  return result

def main(argv):
  folder = argv[0]
  if (len(argv) > 1):
    if os.path.exists(argv[1]):
      folder = argv[1]
    else:
      print ('%s not existed' % argv[1])
      return
  else:
    print ("Usage : ")
    print ("python %s data_dir" % argv[0])
    return

  NUM_MAX_EXAMPLES_PER_DATA_BATCH = 3000
  dataset = os.path.join(folder, 'data_batch_1.bin')
  filenames = [os.path.join(folder, 'data_batch_%d.bin' % i)
               for i in xrange(1, 6)]

  dataset_f = open(dataset, "a+")
  if not dataset_f:
    print "unable to open " + dataset + " with w+ mode"
    return
  cc = 0
  dc = 1
  ii = 0
  cat = 0
  dog = 0
  nor = 0
  for f in os.listdir(folder):
    filepath = os.path.join(folder, f)
    try:
      imf = open(filepath, "r")
      im = Image.open(imf)
      imf.close()
      p = subprocess.Popen(["display", filepath])
      ii = ii + 1
      label = raw_input("please label the %d image:\n\t0 means cat:\n\t1 means dog:\n\t-1 not a cat or dog:\n" % ii)
      p.kill()
      if (label == '-1'):
        os.remove(filepath)
        nor = nor + 1
        continue
      elif (label == '1'):
        dog = dog + 1
      elif (label == '0'):
        cat = cat + 1

      print ("generate data for %s" % filepath)
      result = image_gen(filepath, label)
      os.remove(filepath)
      if (result.status == IMAGE_GEN_OK):
        if (dc < 5 and cc >= NUM_MAX_EXAMPLES_PER_DATA_BATCH):
          cc = 0
          dc = dc + 1
          dataset_f.close()
          dataset_f.close()
          dataset = folder + '/data_batch_' + str(dc) + '.bin'
          dataset_f = open(dataset, "a+")
        cc = cc + 1
        result.imagebytes.tofile(dataset_f)
    except Exception, e:
      print ('%s : %s' % (e, filepath))
  dataset_f.close()
  print ("%d cats found, %d dogs found, %d not a cat or dog" % (cat, dog, nor))

if __name__ == '__main__':
  main(sys.argv)



No comments:

Post a Comment

fixed: embedded-redis: Unable to run on macOS Sonoma

Issue you might see below error while trying to run embedded-redis for your testing on your macOS after you upgrade to Sonoma. java.la...