Getting Started with TensorFlow: notMNIST
2017-04-27

In an earlier post we looked at MNIST and built a simple linear logistic regression model. Next, following the TensorFlow course on Udacity, we work through Assignment 1: notMNIST.

Environment

Windows 7 + Python 3.5 + TensorFlow

(If you have not set up this environment yet, see the earlier setup tutorial.)
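
As a quick sanity check of the environment, you can print the interpreter and TensorFlow versions. This is a small addition of my own, not part of the original post:

import sys
import tensorflow as tf

print(sys.version)       # should report Python 3.5.x
print(tf.__version__)    # confirms TensorFlow imports cleanly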

notMNIST

notMNIST is similar to MNIST: the images are 28x28 and grayscale. Unlike MNIST, however, notMNIST contains ten classes of printed glyphs for the letters A-J, rendered in widely varying typefaces, so the shapes differ greatly, the data is noisier, and it is harder to work with.

Downloading the data

First, download the test set and training set to your machine:

Test set: download link

Training set: download link

Each dataset contains one folder per letter A-J. The downloads are .tar.gz archives: after downloading, extract once to get a .tar file, then extract that file again to obtain the final folders.
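
If you would rather script the download and extraction than do it by hand, a minimal sketch follows. This is my own addition, not part of the original post: the URL is the mirror commonly used for this dataset (verify it before running), and fetch_and_extract is a hypothetical helper. It also produces the train_folders / test_folders lists (the extracted A-J folder paths) that the pickling code further down expects.

import os
import tarfile
from six.moves.urllib.request import urlretrieve

url = 'https://commondatastorage.googleapis.com/books1000/'  # assumed download location

def fetch_and_extract(filename):
    """Download the archive if it is missing, extract it, and return the A-J folder paths."""
    if not os.path.exists(filename):
        print('Downloading', filename)
        urlretrieve(url + filename, filename)
    root = filename.split('.')[0]          # e.g. notMNIST_large
    if not os.path.isdir(root):
        with tarfile.open(filename) as tar:
            tar.extractall()               # creates the root folder with A-J inside
    return sorted(os.path.join(root, d) for d in os.listdir(root)
                  if os.path.isdir(os.path.join(root, d)))

train_folders = fetch_and_extract('notMNIST_large.tar.gz')
test_folders = fetch_and_extract('notMNIST_small.tar.gz')
print(train_folders)
print(test_folders)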

Opening Jupyter Notebook

Since I installed TensorFlow with Anaconda, I open the Start menu, find Anaconda3 under All Programs, and launch Jupyter Notebook from there.

If you did not install with Anaconda, see the earlier tutorial.

Deep learning

From here on we follow the steps of the Udacity TensorFlow course notebook.

Checking the imports

First, check that every package we are about to use is installed. The code is as follows:

from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import tarfile
from IPython.display import display, Image
from scipy import ndimage
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle

Click Run Cell in Jupyter Notebook (the right-pointing arrow). If no error appears, all the packages are installed; if there is an error, install whatever package the message says is missing.

Problem 1: generating the per-class pickles

There is no need to load all of the data into memory at once. Instead, we use pickle to store each class separately and process the classes one at a time; later they are merged into a single workable dataset.

image_size = 28  # Pixel width and height.
pixel_depth = 255.0  # Number of levels per pixel.

def load_letter(folder, min_num_images):
    """Load the data for a single letter label."""
    image_files = os.listdir(folder)
    dataset = np.ndarray(shape=(len(image_files), image_size, image_size),
                         dtype=np.float32)
    print(folder)
    num_images = 0
    for image in image_files:
        image_file = os.path.join(folder, image)
        try:
            image_data = (ndimage.imread(image_file).astype(float) -
                          pixel_depth / 2) / pixel_depth
            if image_data.shape != (image_size, image_size):
                raise Exception('Unexpected image shape: %s' % str(image_data.shape))
            dataset[num_images, :, :] = image_data
            num_images = num_images + 1
        except IOError as e:
            print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.')

    dataset = dataset[0:num_images, :, :]
    if num_images < min_num_images:
        raise Exception('Many fewer images than expected: %d < %d' %
                        (num_images, min_num_images))

    print('Full dataset tensor:', dataset.shape)
    print('Mean:', np.mean(dataset))
    print('Standard deviation:', np.std(dataset))
    return dataset

def maybe_pickle(data_folders, min_num_images_per_class, force=False):
    dataset_names = []
    for folder in data_folders:
        set_filename = folder + '.pickle'
        dataset_names.append(set_filename)
        if os.path.exists(set_filename) and not force:
            # You may override by setting force=True.
            print('%s already present - Skipping pickling.' % set_filename)
        else:
            print('Pickling %s.' % set_filename)
            dataset = load_letter(folder, min_num_images_per_class)
            try:
                with open(set_filename, 'wb') as f:
                    pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
            except Exception as e:
                print('Unable to save data to', set_filename, ':', e)
    return dataset_names

# train_folders / test_folders are the lists of extracted A-J folder paths
# (see the download-and-extract sketch above)
train_datasets = maybe_pickle(train_folders, 45000)
test_datasets = maybe_pickle(test_folders, 1800)

While these cells run, the console prints the progress for each letter folder; once pickling finishes, the full dataset tensor shape, mean, and standard deviation are reported for every letter.

Problem 2: verifying the normalized images

Display one of the images with matplotlib.pyplot to check that the data is intact:

with open('G:/人脸识别&机器学习/udacity/notMNIST/notMNIST_small/B.pickle', 'rb') as pk_f:
    data4show = pickle.load(pk_f)

print(data4show.shape)
pic = data4show[0, :, :]
plt.imshow(pic)
plt.show()

If everything is correct, a recognizable letter is displayed (a 'B' in this case, since B.pickle was loaded).

Problem 3: checking that the data is balanced

Check the size of each per-class dataset to make sure the classes are roughly the same size, which helps training accuracy:

file_path = 'G:/人脸识别&机器学习/udacity/notMNIST/notMNIST_large/{0}.pickle'
for ele in 'ABCDEFGHIJ':
    with open(file_path.format(ele), 'rb') as pk_f:
        dat = pickle.load(pk_f)
    print('number of pictures in {}.pickle = '.format(ele), dat.shape[0])
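
To make the balance check quantitative rather than purely visual, you can also collect the counts and look at their spread. This small sketch is my own addition (it reuses file_path from above; a standard deviation that is small relative to the mean indicates the classes are balanced):

counts = []
for ele in 'ABCDEFGHIJ':
    with open(file_path.format(ele), 'rb') as pk_f:
        counts.append(pickle.load(pk_f).shape[0])

print('mean class size:', np.mean(counts))
print('std of class sizes:', np.std(counts))   # small relative to the mean => balanced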

 


 

Merging and labeling the data

Merge the per-class pickles and attach labels: A-J map to 0-9. A separate validation set is also created for hyperparameter tuning.

def make_arrays(nb_rows, img_size):
    if nb_rows:
        dataset = np.ndarray((nb_rows, img_size, img_size), dtype=np.float32)
        labels = np.ndarray(nb_rows, dtype=np.int32)
    else:
        dataset, labels = None, None
    return dataset, labels

def merge_datasets(pickle_files, train_size, valid_size=0):
    num_classes = len(pickle_files)
    valid_dataset, valid_labels = make_arrays(valid_size, image_size)
    train_dataset, train_labels = make_arrays(train_size, image_size)
    vsize_per_class = valid_size // num_classes
    tsize_per_class = train_size // num_classes

    start_v, start_t = 0, 0
    end_v, end_t = vsize_per_class, tsize_per_class
    end_l = vsize_per_class + tsize_per_class
    for label, pickle_file in enumerate(pickle_files):
        try:
            with open(pickle_file, 'rb') as f:
                letter_set = pickle.load(f)
                # let's shuffle the letters to have random validation and training set
                np.random.shuffle(letter_set)
                if valid_dataset is not None:
                    valid_letter = letter_set[:vsize_per_class, :, :]
                    valid_dataset[start_v:end_v, :, :] = valid_letter
                    valid_labels[start_v:end_v] = label
                    start_v += vsize_per_class
                    end_v += vsize_per_class

                train_letter = letter_set[vsize_per_class:end_l, :, :]
                train_dataset[start_t:end_t, :, :] = train_letter
                train_labels[start_t:end_t] = label
                start_t += tsize_per_class
                end_t += tsize_per_class
        except Exception as e:
            print('Unable to process data from', pickle_file, ':', e)
            raise

    return valid_dataset, valid_labels, train_dataset, train_labels

# The sizes can be adjusted as needed.
train_size = 200000
valid_size = 10000
test_size = 10000

valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets(
    train_datasets, train_size, valid_size)
_, _, test_dataset, test_labels = merge_datasets(test_datasets, test_size)

print('Training:', train_dataset.shape, train_labels.shape)
print('Validation:', valid_dataset.shape, valid_labels.shape)
print('Testing:', test_dataset.shape, test_labels.shape)

The printed shapes should match the requested sizes:

Training: (200000, 28, 28) (200000,)
Validation: (10000, 28, 28) (10000,)
Testing: (10000, 28, 28) (10000,)

Randomizing the data:

def randomize(dataset, labels):
    permutation = np.random.permutation(labels.shape[0])
    shuffled_dataset = dataset[permutation, :, :]
    shuffled_labels = labels[permutation]
    return shuffled_dataset, shuffled_labels

train_dataset, train_labels = randomize(train_dataset, train_labels)
test_dataset, test_labels = randomize(test_dataset, test_labels)
valid_dataset, valid_labels = randomize(valid_dataset, valid_labels)

Problem 4: shuffling the samples and verifying them

Verify that the data is still intact after shuffling:

mapping = {key: val for key, val in enumerate('ABCDEFGHIJ')}

def plot_check(matrix, key):
    plt.imshow(matrix)
    plt.show()
    print('the picture should be ', mapping[key])
    return None

length = train_dataset.shape[0] - 1

# five random checks
for _ in range(5):
    index = np.random.randint(length)
    plot_check(train_dataset[index, :, :], train_labels[index])

If each of the five displayed images matches the letter that plot_check prints, the shuffled data is intact.

Saving the data

pickle_file = os.path.join('G:/人脸识别&机器学习/udacity/notMNIST/notMNIST.pickle')

try:
    f = open(pickle_file, 'wb')
    save = {
        'train_dataset': train_dataset,
        'train_labels': train_labels,
        'valid_dataset': valid_dataset,
        'valid_labels': valid_labels,
        'test_dataset': test_dataset,
        'test_labels': test_labels,
    }
    pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
    f.close()
except Exception as e:
    print('Unable to save data to', pickle_file, ':', e)
    raise

statinfo = os.stat(pickle_file)
print('Compressed pickle size:', statinfo.st_size)

The result:

Compressed pickle size: 718193801

Problem 5: finding overlapping samples

To measure the overlap between the validation set and the training set, compare each validation image with the training images and record pairs whose pixel-wise absolute difference sums to less than a small threshold (0.2 here). The training set is split into three chunks so the comparison can run in three processes:

import multiprocessing as mp
import numpy as np
import pickle as pk
import collections

def check_overlap(dataset1, start, end, dataset2, q):
    '''
    Find the samples of dataset1 that also appear in dataset2[start:end].
    '''
    overlappingindex = []
    length1 = dataset1.shape[0]
    for i in range(length1):
        for j in range(start, end):
            error = np.sum(np.sum(np.abs(dataset1[i, :, :] - dataset2[j, :, :])))
            if error < 0.2:
                overlappingindex.append((i, j))
                break
    q.put(tuple(overlappingindex))
    return None

print('start loading notMNIST.pickle')
with open('G:/人脸识别&机器学习/udacity/notMNIST/notMNIST.pickle', 'rb') as f:
    data = pk.load(f)
print('loading finished.')

q = mp.Queue()
train_dataset = data['train_dataset']
valid_dataset = data['valid_dataset']
length = train_dataset.shape[0]
idx = (0, int(length * 1.0 / 3), int(length * 2.0 / 3), length)

# split the training set into three chunks, each scanned by its own process
# (note: on Windows, multiprocessing may require running this as a .py script
# under an `if __name__ == '__main__':` guard)
processes = []
for i in range(3):
    processes.append(mp.Process(target=check_overlap,
                                args=(valid_dataset, idx[i], idx[i + 1], train_dataset, q)))
for p in processes:
    p.start()
for p in processes:
    p.join()

results = []
while not q.empty():
    results.append(q.get())
print(results)

# recursively flatten the nested tuples of (valid_index, train_index) pairs
func = lambda x: [i for xs in x for i in func(xs)] if isinstance(x, collections.Iterable) else [x]
print(func(results))

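
A natural follow-up, which is my own addition rather than part of the original listing, is to drop the overlapping samples from the validation set so that evaluation is not inflated by near-duplicates. A minimal sketch, reusing results, valid_dataset, and data from the cell above:

# each item in results is a tuple of (valid_index, train_index) pairs from one process
overlap_pairs = [pair for chunk in results for pair in chunk]
dup_valid_idx = sorted({i for i, _ in overlap_pairs})

mask = np.ones(valid_dataset.shape[0], dtype=bool)
mask[dup_valid_idx] = False
valid_dataset_clean = valid_dataset[mask]
valid_labels_clean = data['valid_labels'][mask]
print('removed {} near-duplicate validation images'.format(len(dup_valid_idx)))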

Problem 6: training a simple model

Using the logistic regression model from scikit-learn, train with training-set sizes of 50, 100, 1000, and 5000, then compare the results:

from sklearn.linear_model import LogisticRegression
import pickle
import numpy as np

size = 50          # training-set size; read as a global inside train_linear_logistic
image_size = 28

# loading data
print('loading data')
# `data` is assumed to still be in memory from Problem 5;
# otherwise uncomment the next two lines to reload the combined pickle
# with open('notMNIST.pickle', 'rb') as f:
#     data = pickle.load(f)
print('finish loading')

train_dt = data['train_dataset']
length = train_dt.shape[0]
train_dt = train_dt.reshape(length, image_size * image_size)
train_lb = data['train_labels']

test_dt = data['test_dataset']
length = test_dt.shape[0]
test_lb = data['test_labels']
test_dt = test_dt.reshape(length, image_size * image_size)

def train_linear_logistic(tdata, tlabel):
    model = LogisticRegression(C=1.0, penalty='l1')
    print('initializing model, size is = {}'.format(size))
    model.fit(tdata[:size, :], tlabel[:size])
    print('testing model')
    y_out = model.predict(test_dt)
    print('the accuracy of the model of size = {} is {}'.format(
        size, np.sum(y_out == test_lb) * 1.0 / len(y_out)))
    return None

train_linear_logistic(train_dt, train_lb)

size = 100
train_linear_logistic(train_dt, train_lb)

size = 1000
train_linear_logistic(train_dt, train_lb)

size = 5000
train_linear_logistic(train_dt, train_lb)

The accuracy is printed for each of the four training-set sizes, and it improves as the training set grows.

This is only a first step; there is much more to learn from here.
