tensorflow初学之notMNIST
2017-04-27       个评论
我要投稿

win7+python3.5+tensorflow

(环境如果不会可看之前的教程：点击传送门)

notMNIST

notMNIST与MNIST类似，图片均为28x28的灰度图。但相比MNIST，notMNIST含有 A-J 共10个类别的艺术印刷体字符，字符形状各异，噪声更多，更加难以处理。

deep learning

```from __future__ import print_function

import matplotlib.pyplot as plt

import numpy as np

import os

import sys

import tarfile

from IPython.display import display, Image

from scipy import ndimage

from sklearn.linear_model import LogisticRegression

from six.moves.urllib.request import urlretrieve

from six.moves import cPickle as pickle```

problem1：生成pickle

```image_size = 28 # Pixel width and height.

pixel_depth = 255.0 # Number of levels per pixel.

-

"""Load the data for a single letter label."""

image_files = os.listdir(folder)

dataset = np.ndarray(shape=(len(image_files), image_size, image_size),

dtype=np.float32)

print(folder)

num_images = 0

for image in image_files:

image_file = os.path.join(folder, image)

try:

pixel_depth / 2) / pixel_depth

if image_data.shape != (image_size, image_size):

raise Exception('Unexpected image shape: %s' % str(image_data.shape))

dataset[num_images, :, :] = image_data

num_images = num_images + 1

except IOError as e:

print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.')

dataset = dataset[0:num_images, :, :]

if num_images < min_num_images:

raise Exception('Many fewer images than expected: %d < %d' %

(num_images, min_num_images))

print('Full dataset tensor:', dataset.shape)

print('Mean:', np.mean(dataset))

print('Standard deviation:', np.std(dataset))

return dataset

def maybe_pickle(data_folders, min_num_images_per_class, force=False):
    """Pickle each letter folder to '<folder>.pickle', skipping existing files.

    data_folders: list of per-letter image directories.
    min_num_images_per_class: minimum images load_letter must find per folder.
    force: when True, re-generate pickles even if the file already exists.
    Returns the list of pickle filenames (one per entry of data_folders).
    """
    dataset_names = []
    for folder in data_folders:
        set_filename = folder + '.pickle'
        dataset_names.append(set_filename)
        if os.path.exists(set_filename) and not force:
            # You may override by setting force=True.
            print('%s already present - Skipping pickling.' % set_filename)
        else:
            print('Pickling %s.' % set_filename)
            # NOTE(review): this call was lost from the article, leaving
            # `dataset` undefined below; restored.
            dataset = load_letter(folder, min_num_images_per_class)
            try:
                with open(set_filename, 'wb') as f:
                    pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
            except Exception as e:
                print('Unable to save data to', set_filename, ':', e)

    return dataset_names

# Pickle every letter folder, requiring at least 45000 (train) / 1800 (test)
# images per class.  NOTE(review): train_folders / test_folders come from the
# download-and-extract step that this excerpt does not show.
train_datasets = maybe_pickle(train_folders, 45000)

test_datasets = maybe_pickle(test_folders, 1800)```

problem2：验证归一化的图像

```with open('G:/人脸识别&机器学习/udacity/notMNIST/notMNIST_small/B.pickle','rb') as pk_f:

print(data4show.shape)

pic = data4show[0,:,:]

plt.imshow(pic)

plt.show()```

problem3：验证数据平衡

```file_path = 'G:/人脸识别&机器学习/udacity/notMNIST/notMNIST_large/{0}.pickle'

for ele in 'ABCDEFJHIJ':

with open(file_path.format(ele), 'rb') as pk_f:

print('number of pictures in {}.pickle = '.format(ele), dat.shape[0])```

```def make_arrays(nb_rows, img_size):

if nb_rows:

dataset = np.ndarray((nb_rows, img_size, img_size), dtype=np.float32)

labels = np.ndarray(nb_rows, dtype=np.int32)

else:

dataset, labels = None, None

return dataset, labels

def merge_datasets(pickle_files, train_size, valid_size=0):
    """Merge per-letter pickles into stacked train/validation arrays.

    pickle_files: one pickle path per class; the list index is the label.
    train_size / valid_size: total sample counts, split evenly per class.
    Returns (valid_dataset, valid_labels, train_dataset, train_labels); the
    validation pair is (None, None) when valid_size == 0 (see make_arrays).
    """
    num_classes = len(pickle_files)
    valid_dataset, valid_labels = make_arrays(valid_size, image_size)
    train_dataset, train_labels = make_arrays(train_size, image_size)
    vsize_per_class = valid_size // num_classes
    tsize_per_class = train_size // num_classes

    start_v, start_t = 0, 0
    end_v, end_t = vsize_per_class, tsize_per_class
    end_l = vsize_per_class + tsize_per_class
    for label, pickle_file in enumerate(pickle_files):
        try:
            with open(pickle_file, 'rb') as f:
                # NOTE(review): this load was lost from the article, leaving
                # `letter_set` undefined; restored.
                letter_set = pickle.load(f)
                # let's shuffle the letters to have random validation and training set
                np.random.shuffle(letter_set)
                if valid_dataset is not None:
                    valid_letter = letter_set[:vsize_per_class, :, :]
                    valid_dataset[start_v:end_v, :, :] = valid_letter
                    valid_labels[start_v:end_v] = label
                    start_v += vsize_per_class
                    end_v += vsize_per_class

                train_letter = letter_set[vsize_per_class:end_l, :, :]
                train_dataset[start_t:end_t, :, :] = train_letter
                train_labels[start_t:end_t] = label
                start_t += tsize_per_class
                end_t += tsize_per_class
        except Exception as e:
            print('Unable to process data from', pickle_file, ':', e)
            raise

    return valid_dataset, valid_labels, train_dataset, train_labels

# The sizes below can be adjusted as needed.

train_size = 200000

valid_size = 10000

test_size = 10000

# Build train/validation from the large set and test from the small set.
valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets(

train_datasets, train_size, valid_size)

_, _, test_dataset, test_labels = merge_datasets(test_datasets, test_size)

print('Training:', train_dataset.shape, train_labels.shape)

print('Validation:', valid_dataset.shape, valid_labels.shape)

print('Testing:', test_dataset.shape, test_labels.shape)```

```def randomize(dataset, labels):

permutation = np.random.permutation(labels.shape[0])

shuffled_dataset = dataset[permutation,:,:]

shuffled_labels = labels[permutation]

return shuffled_dataset, shuffled_labels

# Shuffle each split so classes are interleaved rather than grouped A..J.
train_dataset, train_labels = randomize(train_dataset, train_labels)

test_dataset, test_labels = randomize(test_dataset, test_labels)

valid_dataset, valid_labels = randomize(valid_dataset, valid_labels)

```

problem4：样本乱序与验证

```mapping = {key:val for key, val in enumerate('ABCDEFGHIJ')}

def plot_check(matrix, key):

plt.imshow(matrix)

plt.show()

print('the picture should be ', mapping[key])

return None

length = train_dataset.shape[0]-1

# Spot-check five random training samples against their labels.
# NOTE(review): indentation was lost in this excerpt -- the two lines below
# belong inside the for loop.

for _ in range(5):

index = np.random.randint(length)

plot_check(train_dataset[index,:,:], train_labels[index])```

```pickle_file = os.path.join( 'G:/人脸识别&机器学习/udacity/notMNIST/notMNIST.pickle')

try:

f = open(pickle_file, 'wb')

save = {

'train_dataset': train_dataset,

'train_labels': train_labels,

'valid_dataset': valid_dataset,

'valid_labels': valid_labels,

'test_dataset': test_dataset,

'test_labels': test_labels,

}

pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)

f.close()

except Exception as e:

print('Unable to save data to', pickle_file, ':', e)

raise

statinfo = os.stat(pickle_file)

print('Compressed pickle size:', statinfo.st_size)```

```
Compressed pickle size: 718193801
```

problem5：寻找重叠样本

```python

import multiprocessing as mp

import numpy as np

import pickle as pk

import collections

def check_overlap(dataset1, start, end, dataset2, q):
    """Find near-duplicate images between two datasets.

    For every image in dataset1, scan dataset2[start:end] for the first image
    whose summed absolute pixel difference is below 0.2 (i.e. essentially
    identical after normalization) and record the (i, j) index pair.  The
    tuple of pairs is pushed onto `q` so a multiprocessing driver can collect
    results from several workers.
    """
    matches = []
    for i in range(dataset1.shape[0]):
        for j in range(start, end):
            diff = np.abs(dataset1[i, :, :] - dataset2[j, :, :]).sum()
            if diff < 0.2:
                matches.append((i, j))
                break  # first match is enough for this image
    q.put(tuple(matches))
    return None

with open('G:/人脸识别&机器学习/udacity/notMNIST/notMNIST.pickle', 'rb') as f:
    # NOTE(review): this load was lost from the article; everything below
    # needs `data`.  Restored.
    data = pickle.load(f)

q = mp.Queue()
train_dataset = data['train_dataset']
valid_dataset = data['valid_dataset']
length = train_dataset.shape[0]
# Split the training set into three roughly equal ranges, one per worker.
idx = (0, int(length * 1.0 / 3), int(length * 2.0 / 3), length)
processes = []
for i in range(3):
    processes.append(mp.Process(target=check_overlap,
                                args=(valid_dataset, idx[i], idx[i + 1], train_dataset, q)))
for p in processes:
    p.start()
for p in processes:
    p.join()

# Drain the queue: one tuple of (valid_idx, train_idx) pairs per worker.
results = []
while not q.empty():
    results.append(q.get())
print(results)

# Flatten arbitrarily nested iterables of indices into one flat list.
# FIX: collections.Iterable was removed in Python 3.10; use collections.abc.
func = lambda x: [i for xs in x for i in func(xs)] if isinstance(x, collections.abc.Iterable) else [x]
print(func(results))

# NOTE(review): everything from here down is an accidental duplicate paste of
# the problem-5 code above (the article fused this import onto the previous
# print line).
import multiprocessing as mp

import numpy as np

import pickle as pk

import collections

# NOTE(review): this function is an accidental duplicate paste of
# check_overlap from earlier in the article; indentation was lost in
# extraction, so the body lines below belong inside the def/loops.
def check_overlap(dataset1, start, end, dataset2, q):

'''

compute the number of data in dataset1 existing in dataset2[start to end]

'''

overlappingindex = []

length1 = dataset1.shape[0]

for i in range(length1):

for j in range(start, end):

error = np.sum(np.sum(np.abs(dataset1[i,:,:]-dataset2[j,:,:])))

if error < 0.2:

overlappingindex.append((i,j))

break

q.put(tuple(overlappingindex))

return None

with open('G:/人脸识别&机器学习/udacity/notMNIST/notMNIST.pickle', 'rb') as f:

q = mp.Queue()

train_dataset = data['train_dataset']

valid_dataset = data['valid_dataset']

length = train_dataset.shape[0]

idx = (0, int(length*1.0/3), int(length*2.0/3), length)

processes = []

for i in range(3):

processes.append(mp.Process(target=check_overlap, args=(valid_dataset, idx[i], idx[i+1], train_dataset, q)))

for p in processes:

p.start()

for p in processes:

p.join()

results = []

while not q.empty():

results.append(q.get())

print(results)

func = lambda x: [i for xs in x for i in func(xs)]if isinstance(x,collections.Iterable)else [x]

print(func(results))```

problem6：训练简单的机器模型

```from sklearn.linear_model import LogisticRegression

import pickle

import numpy as np

size = 50  # number of training samples per fit; reassigned by later calls
image_size = 28  # pixel width/height of each notMNIST image

# NOTE(review): the article commented out the open() and lost the
# pickle.load line, leaving `data` undefined; restored from the comment.
with open('notMNIST.pickle', 'rb') as f:
    data = pickle.load(f)

train_dt = data['train_dataset']
length = train_dt.shape[0]
# Flatten each 28x28 image into a 784-dim feature vector for sklearn.
train_dt = train_dt.reshape(length, image_size * image_size)
train_lb = data['train_labels']

test_dt = data['test_dataset']
length = test_dt.shape[0]
test_lb = data['test_labels']
test_dt = test_dt.reshape(length, image_size * image_size)

def train_linear_logistic(tdata, tlabel):
    """Fit an L1-regularized logistic regression on the first `size` samples
    and print its accuracy on the test set.

    Relies on module-level globals: `size` (training subset length) and
    `test_dt` / `test_lb` (flattened test data and labels).
    """
    # FIX: pin solver='liblinear' -- it was sklearn's default in 2017 and is
    # the solver that supports penalty='l1'; the modern default (lbfgs)
    # rejects l1, so the original call fails on current sklearn.
    model = LogisticRegression(C=1.0, penalty='l1', solver='liblinear')
    print('initializing model size is = {}'.format(size))
    model.fit(tdata[:size, :], tlabel[:size])
    print('testing model')
    y_out = model.predict(test_dt)
    print('the accurace of the mode of size = {} is {}'.format(
        size, np.sum(y_out == test_lb) * 1.0 / len(y_out)))
    return None

# Train with increasing subset sizes; `size` is read as a global inside
# train_linear_logistic, so reassigning it changes the next fit.
train_linear_logistic(train_dt, train_lb)

size = 100

train_linear_logistic(train_dt, train_lb)

size = 1000

train_linear_logistic(train_dt, train_lb)

size = 5000

train_linear_logistic(train_dt, train_lb)```