diff --git a/.travis.yml b/.travis.yml
index fb795aa..e92c398 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -29,7 +29,7 @@ script:
     - cd src && sox ../../wav/wia.wav -t raw -r 16000 - | ./lpcnet_enc -s | ./lpcnet_dec -s > /dev/null
     # some LPCNet ctests
     - ls -l
-    - cd $BUILDDIR && ctest
+    - cd $BUILDDIR && ctest --output-on-failure
     # Re-build codec2 with LPCNet and test FreeDV 2020 support
     - cd $CODEC2DIR/build_linux
     - make clean
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 680f52c..1d5b623 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,6 +10,7 @@ project(LPCNet C)
 option(DISABLE_CPU_OPTIMIZATION "Disable CPU optimization discovery." OFF)
 option(AVX2 "Enable AVX2 CPU optimizations." OFF)
 option(AVX "Enable AVX CPU optimizations." OFF)
+option(SSE "Enable SSE CPU optimizations." OFF)
 option(NEON "Enable NEON CPU optimizations for RPi." OFF)
 
 include(GNUInstallDirs)
@@ -19,6 +20,11 @@ mark_as_advanced(CLEAR
     CMAKE_INSTALL_LIBDIR
 )
 
+# Build universal ARM64 and x86_64 binaries on Mac when BUILD_OSX_UNIVERSAL is set. NOTE(review): CMake documents that CMAKE_OSX_ARCHITECTURES should be set before the first project() call -- confirm it takes effect here.
+if(BUILD_OSX_UNIVERSAL)
+set(CMAKE_OSX_ARCHITECTURES "x86_64;arm64")
+endif(BUILD_OSX_UNIVERSAL)
+
 #
 # Prevent in-source builds
 # If an in-source build is attempted, you will still need to clean up a few
@@ -43,15 +49,41 @@ set(LPCNET_VERSION_MINOR 2)
 set(LPCNET_VERSION_PATCH FALSE)
 set(LPCNET_VERSION "${LPCNET_VERSION_MAJOR}.${LPCNET_VERSION_MINOR}")
 # Patch level version bumps should not change API/ABI.
-set(SOVERSION "${LPCNET_VERSION_MAJOR}.${LPCNET_VERSION_MINOR}")
+set(LPCNET_SOVERSION "${LPCNET_VERSION_MAJOR}.${LPCNET_VERSION_MINOR}")
 if(LPCNET_VERSION_PATCH)
     set(LPCNET_VERSION "${LPCNET_VERSION}.${LPCNET_VERSION_PATCH}")
 endif()
 message(STATUS "LPCNet version: ${LPCNET_VERSION}")
 
+# Pass the git revision to the compiler as the GIT_HASH string define: the output of
+# `git describe --always HEAD` in a git working copy, "Unknown" when git itself is
+# not found, and "None" when the source tree has no .git entry.
+if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
+    find_package(Git QUIET)
+    if(Git_FOUND)
+        execute_process(
+            COMMAND "${GIT_EXECUTABLE}" describe --always HEAD
+            WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
+            RESULT_VARIABLE res
+            OUTPUT_VARIABLE FREEDV_HASH
+            ERROR_QUIET
+            OUTPUT_STRIP_TRAILING_WHITESPACE)
+        message(STATUS "LPCNet current git hash: ${FREEDV_HASH}")
+        add_definitions(-DGIT_HASH="${FREEDV_HASH}")
+    else()
+        message(WARNING "Git not found. Can not determine current commit hash.")
+        add_definitions(-DGIT_HASH="Unknown")
+    endif()
+else()
+    add_definitions(-DGIT_HASH="None")
+endif()
+
 # Set default flags
 set(CMAKE_C_FLAGS "-Wall -W -Wextra -Wno-unused-function -O3 -g -I. -MD ${CMAKE_C_FLAGS} -DENABLE_ASSERTIONS")
 
+# Arch specific stuff here
+message(STATUS "Host system arch is: ${CMAKE_SYSTEM_PROCESSOR}")
+
 # Detection of available CPU optimizations
 if(NOT DISABLE_CPU_OPTIMIZATION)
     if(UNIX AND NOT APPLE)
@@ -60,15 +92,25 @@ if(NOT DISABLE_CPU_OPTIMIZATION)
             OUTPUT_VARIABLE AVX2)
         execute_process(COMMAND grep -c "avx " /proc/cpuinfo
             OUTPUT_VARIABLE AVX)
+        execute_process(COMMAND grep -c "sse4_1 " /proc/cpuinfo
+            OUTPUT_VARIABLE SSE)
         execute_process(COMMAND grep -c "neon" /proc/cpuinfo
             OUTPUT_VARIABLE NEON)
     elseif(APPLE)
-        # Under OSX we need to look through a few sysctl entries to determine what our CPU supports.
-        message(STATUS "Looking for available CPU optimizations on an OSX system...")
-        execute_process(COMMAND sysctl -a COMMAND grep machdep.cpu.leaf7_features COMMAND grep -c AVX2
-            OUTPUT_VARIABLE AVX2)
-        execute_process(COMMAND sysctl -a COMMAND grep machdep.cpu.features COMMAND grep -c AVX
-            OUTPUT_VARIABLE AVX)
+        if(BUILD_OSX_UNIVERSAL)
+            # Presume AVX/AVX2 are enabled on the x86 side. The ARM side will auto-enable
+            # NEON optimizations by virtue of being aarch64.
+            set(AVX TRUE)
+            set(AVX2 TRUE)
+            set(SSE TRUE)
+        else()
+            # Under OSX we need to look through a few sysctl entries to determine what our CPU supports.
+            message(STATUS "Looking for available CPU optimizations on an OSX system...")
+            execute_process(COMMAND sysctl -a COMMAND grep machdep.cpu.leaf7_features COMMAND grep -c AVX2
+                OUTPUT_VARIABLE AVX2)
+            execute_process(COMMAND sysctl -a COMMAND grep machdep.cpu.features COMMAND grep -c AVX
+                OUTPUT_VARIABLE AVX)
+        endif(BUILD_OSX_UNIVERSAL)
     elseif(WIN32)
         message(STATUS "No detection capability on Windows, assuming AVX is available.")
         set(AVX TRUE)
@@ -85,9 +127,13 @@ elseif(${AVX} OR ${AVX} GREATER 0)
 # AVX2 machines will also match on AVX
     message(STATUS "avx processor flags found or enabled.")
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
+elseif(${SSE} OR ${SSE} GREATER 0)
+# AVX and AVX2 machines will also match on SSE
+    message(STATUS "sse processor flags found or enabled.")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.1")
 endif()
 
-# RPi
+# RPi / ARM 32bit
 if(${NEON} OR ${NEON} GREATER 0)
     message(STATUS "neon processor flags found or enabled.")
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon -march=armv8-a -mtune=cortex-a53")
diff --git a/README.md b/README.md
index c446450..5b72d8c 100644
--- a/README.md
+++ b/README.md
@@ -25,14 +25,22 @@ LPCNet at 1733 bits/s using direct-split quantiser:
 ```
 sox ../../wav/wia.wav -t raw -r 16000 - | ./lpcnet_enc -s | ./lpcnet_dec -s | aplay -f S16_LE -r 16000
 ```
-# CTests
+
+## Manually Selecting SIMD Technology
+
+CMake will select the fastest SIMD technology available (AVX/SSE/None); however, you can manually select one, e.g.:
+```
+cmake -DDISABLE_CPU_OPTIMIZATION=ON -DSSE=ON -DCODEC2_BUILD_DIR=~/codec2/build_linux ..
+```
+
+## CTests
 
 ```
 $ cd ~/LPCNet/build_linux
 $ ctest
 ```
 
-Note, due to precision/library issues several tests (1-3) will only pass on certain machines such as Ubuntu 16 and 18, Ubuntu 17 is known to fail.
+Note, due to precision/library issues several tests (1-3) will [only pass on some machines](https://github.com/drowe67/LPCNet/issues/17).
 
 # Reading Further
 
diff --git a/src/700c_train.sh b/src/700c_train.sh
new file mode 100755
index 0000000..3be057e
--- /dev/null
+++ b/src/700c_train.sh
@@ -0,0 +1,73 @@
+#!/bin/bash -x
+# 700c_train.sh
+# David Rowe March 2020
+# Experiments in LPCNet decoding of Codec 2 700C
+
+PATH=$HOME/codec2/build_linux/src:$HOME/LPCNet/build_linux/src:$HOME/LPCNet/src:$PATH
+
+if [ "$#" -ne 1 ]; then
+    echo "usage: ./700c_train.sh datestamp"
+    echo "       ./700c_train.sh 200404"
+    exit 0
+fi
+
+train1=dev-clean-8k
+test1=test-clean-8k
+test2=all_speech_subset_8k
+test3=all_8k
+datestamp=$1
+epochs=30
+log=${1}.txt
+train=${datestamp}_train
+
+# synth "c2sim arg for experiment" "experiment label" "filename"
+synth() {
+    test=$3
+    c2sim ~/Downloads/${test}.sw --rateKWov ${test}.f32 ${1}
+    test_lpcnet --mag 2 --frame_size 80 --pre 0 ${test}.f32 ${datestamp}_${test}_${2}.sw
+}
+
+# experiment "c2sim arg for experiment" "experiment label": extract features, train, rebuild test_lpcnet, synthesise the test files
+experiment() {
+    echo "------------------------------------------------------------------------------"
+    echo "train starting" ${2}
+    echo "------------------------------------------------------------------------------"
+    # ${1} is left unquoted on the c2sim line -- presumably so several c2sim arguments can be passed in one string (confirm)
+    c2sim ${train}.sw --ten_ms_centre ${train}_10ms.sw --rateKWov ${train}.f32 ${1}
+    sw2packedulaw --frame_size 80 ${train}_10ms.sw ${train}.f32 ${train}_10ms.pulaw
+    # train for ${epochs} epochs; ${datestamp}_<label> is presumably the model file prefix (confirm against train_lpcnet.py)
+    train_lpcnet.py ${train}.f32 ${train}_10ms.pulaw ${datestamp}_${2} --epochs ${epochs} --frame_size 80
+    # nnet_data.c is presumably written by dump_lpcnet.py; it is copied into src before test_lpcnet is rebuilt
+    dump_lpcnet.py ${datestamp}_${2}_${epochs}.h5
+    cp nnet_data.c src
+    make test_lpcnet
+    # run synth() (c2sim feature extraction + test_lpcnet) on each of the three test files
+    synth "${1}" "${2}" "${test1}"
+    synth "${1}" "${2}" "${test2}"
+    synth "${1}" "${2}" "${test3}"
+}
+
+rm -f $log
+
+(
+    date
+
+    # assemble some training speech
+    sox -r 8000 -c 1 ~/Downloads/${train1}.sw \
+	-t sw -r 8000 -c 1 ${train}.sw    
+
+    # LPCNet with 10ms frames (similar to training data) 
+    experiment "" "none"
+    
+    # Codec 2 700C at 40ms frame rate (700 bits/s) from c2dec
+    c2enc 700C ~/Downloads/${test1}.sw - --eq --var | c2dec 700C - /dev/null --mlfeat ${test1}_dec4.f32
+    test_lpcnet --mag 2 --frame_size 80 --pre 0 ${test1}_dec4.f32 ${datestamp}_${test1}_40.sw
+    c2enc 700C ~/Downloads/${test2}.sw - --eq --var | c2dec 700C - /dev/null --mlfeat ${test2}_dec4.f32
+    test_lpcnet --mag 2 --frame_size 80 --pre 0 ${test2}_dec4.f32 ${datestamp}_${test2}_40.sw
+    c2enc 700C ~/Downloads/${test3}.sw - --eq --var | c2dec 700C - /dev/null --mlfeat ${test3}_dec4.f32
+    test_lpcnet --mag 2 --frame_size 80 --pre 0 ${test3}_dec4.f32 ${datestamp}_${test3}_40.sw
+    
+    date
+) |& tee $log
+
+
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 41a78dc..0df4672 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -23,6 +23,8 @@ add_library(lpcnetfreedv SHARED ${lpcnet_freedv_srcs})
 target_link_libraries(lpcnetfreedv codec2)
 set_target_properties(lpcnetfreedv PROPERTIES
     PUBLIC_HEADER lpcnet_freedv.h
+	VERSION ${LPCNET_VERSION}
+	SOVERSION ${LPCNET_SOVERSION}
 )
 target_include_directories(lpcnetfreedv INTERFACE
     $<INSTALL_INTERFACE:include/lpcnet>
@@ -49,11 +51,11 @@ target_link_libraries(dump_data lpcnetfreedv m codec2)
 add_executable(test_lpcnet test_lpcnet.c)
 target_link_libraries(test_lpcnet lpcnetfreedv m codec2)
 
-if(AVX OR AVX2)
+if(SSE OR AVX OR AVX2 OR CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64)$")  # 64-bit ARM is reported as "aarch64" on Linux and "arm64" on macOS
     add_executable(test_vec test_vec.c)
     target_link_libraries(test_vec m)
 else()
-    message(WARNING "No AVX/AVX2 CPU flags identified, not building test_vec.")
+    message(WARNING "No SSE/AVX/AVX2 CPU flags identified, not building test_vec.")
 endif()
 
 add_executable(quant_feat quant_feat.c)
@@ -98,6 +100,12 @@ target_link_libraries(idct lpcnetfreedv m codec2)
 add_executable(nnet2f32 nnet2f32.c)
 target_link_libraries(nnet2f32 lpcnetfreedv m)
 
+add_executable(sw2packedulaw sw2packedulaw.c)
+target_link_libraries(sw2packedulaw lpcnetfreedv m)
+
+add_executable(thash thash.c)
+target_link_libraries(thash lpcnetfreedv m)
+
 install(TARGETS lpcnet_enc lpcnet_dec
     RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
     )
diff --git a/src/codec2_pitch.c b/src/codec2_pitch.c
index a267785..55fb5bc 100644
--- a/src/codec2_pitch.c
+++ b/src/codec2_pitch.c
@@ -113,6 +113,7 @@ int codec2_pitch_est(CODEC2_PITCH *pitch, float Sn[], float *f0, float *voicing)
 
 void codec2_pitch_destroy(CODEC2_PITCH *pitch)
 {
+    free(pitch->fft_fwd_cfg);
     nlp_destroy(pitch->nlp_states);
     free(pitch->w);
     free(pitch);
diff --git a/src/concat.sh b/src/concat.sh
old mode 100644
new mode 100755
index 8369117..d98ccda
--- a/src/concat.sh
+++ b/src/concat.sh
@@ -1,6 +1,8 @@
-# Place in 16k-LP7 from TSPSpeech.iso and run to concatenate wave files
-# into one headerless training file
-for i in */*.wav
+#!/bin/bash
+# Concatenate .wav files into one headerless .sw training file
+# usage: ./concat.sh concatfile.sw
+
+for i in `find . -name '*.wav'`
 do
 sox $i -r 16000 -c 1 -t sw -
-done > input.s16
+done > $1
diff --git a/src/dump_data.c b/src/dump_data.c
index cd936cf..4e8d3c4 100644
--- a/src/dump_data.c
+++ b/src/dump_data.c
@@ -453,6 +453,7 @@ int main(int argc, char **argv) {
 	assert(pitch_index < 2*PITCH_MAX_PERIOD);
 	assert(pitch_index >= 2*PITCH_MIN_PERIOD);
         features[2*NB_BANDS] = 0.01*(pitch_index-200);
+	//fprintf(stderr, "count: %d [36] %f pitch_index: %d\n", count, features[36], pitch_index);
         if (c2voicing_en) features[2*NB_BANDS+1] = voicing;
     }
     fwrite(features, sizeof(float), NB_FEATURES, ffeat);
diff --git a/ext_pitch.sh b/src/ext_pitch.sh
similarity index 100%
rename from ext_pitch.sh
rename to src/ext_pitch.sh
diff --git a/src/flac_to_wav.sh b/src/flac_to_wav.sh
new file mode 100755
index 0000000..8f8aa29
--- /dev/null
+++ b/src/flac_to_wav.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# Convert all .flac files under this folder to .wav files
+# source: several GitHub repos
+
+find . -iname "*.flac" | wc
+
+for flacfile in `find . -iname "*.flac"`
+do
+    ffmpeg -y -f flac -i $flacfile -ab 64k -ac 1 -ar 16000 -f wav "${flacfile%.*}.wav"
+done
diff --git a/src/freq.c b/src/freq.c
index c88d071..dbe94d9 100644
--- a/src/freq.c
+++ b/src/freq.c
@@ -140,6 +140,13 @@ static void check_init() {
   common.init = 1;
 }
 
+void freq_close() {  /* frees common.kfft if common.init is set, then clears the flag; does nothing otherwise */
+    if (common.init) {
+        opus_fft_free(common.kfft,0);
+        common.init = 0;  /* NOTE(review): presumably lets check_init() set things up again on next use -- confirm */
+    }
+}
+
 void dct(float *out, const float *in) {
   int i;
   check_init();
diff --git a/src/freq.h b/src/freq.h
index 0316edd..314eabd 100644
--- a/src/freq.h
+++ b/src/freq.h
@@ -42,6 +42,7 @@
 
 #define NB_BANDS 18
 
+void freq_close(void);
 void compute_band_energy(float *bandE, const kiss_fft_cpx *X);
 void compute_band_corr(float *bandE, const kiss_fft_cpx *X, const kiss_fft_cpx *P);
 
diff --git a/src/lpcnet.c b/src/lpcnet.c
index e117f1c..9f3f059 100644
--- a/src/lpcnet.c
+++ b/src/lpcnet.c
@@ -54,8 +54,10 @@ struct LPCNetState {
     float old_lpc[FEATURES_DELAY][LPC_ORDER];
     float old_gain[FEATURES_DELAY];
     int frame_count;
+    float preemph;
     float deemph_mem;
-    FILE *ftest;                    /* used to dump states for automates tests */
+    int   pitch_embedding;
+    FILE *ftest;                    /* used to dump states for automated tests */
 };
 
 
@@ -118,6 +120,8 @@ LPCNetState *lpcnet_create()
     lpcnet = (LPCNetState *)calloc(sizeof(LPCNetState), 1);
     lpcnet->last_exc = 128;
     lpcnet->ftest = NULL;
+    lpcnet->preemph = PREEMPH;
+    lpcnet->pitch_embedding = 1;
     return lpcnet;
 }
 
@@ -135,7 +139,15 @@ void lpcnet_open_test_file(LPCNetState *lpcnet, char file_name[]) {
     }
 }
 
-void lpcnet_synthesize(LPCNetState *lpcnet, short *output, const float *features, int N, int logmag)
+void lpcnet_set_preemph(LPCNetState *lpcnet, float preemph) {
+    lpcnet->preemph = preemph;
+}
+
+void lpcnet_set_pitch_embedding(LPCNetState *lpcnet, int val) {
+    lpcnet->pitch_embedding = val;
+}
+
+void lpcnet_synthesize(LPCNetState *lpcnet, short *output, float *features, int N, int mag)
 {
     static int count = 0;
     int i;
@@ -149,13 +161,19 @@ void lpcnet_synthesize(LPCNetState *lpcnet, short *output, const float *features
     static int start = 0; /*(LPC_ORDER+1*/;
     /* FIXME: Do proper rounding once the Python code rounds properly. */
 
-    pitch = (int)floor(.1 + 50*features[36]+100);    
-    assert(pitch >=0); assert(pitch <= 255);    
-    /* latest networks (using the codec 2 pitch estimator) are trained
-       with pitch estimates between 40 and 255, but due to the pitch
-       quantiser design and bit errors it's possible to get pitch
-       values down to 32, which upsets the pitch embed matrix */
-    if (pitch < 40) pitch = 40;
+    if (lpcnet->pitch_embedding) {
+	pitch = (int)floor(.1 + 50*features[36]+100);
+	//fprintf(stderr, "count: %d [36] %f pitch: %d\n", lpcnet->frame_count, features[36], pitch);
+	assert(pitch >=0); assert(pitch <= 255);    
+	/* latest networks (using the codec 2 pitch estimator) are trained
+	   with pitch estimates between 40 and 255, but due to the pitch
+	   quantiser design and bit errors it's possible to get pitch
+	   values down to 32, which upsets the pitch embed matrix */
+	if (pitch < 40) pitch = 40;
+    }
+    else {
+	pitch = 0;
+    }
     
     pitch_gain = lpcnet->old_gain[FEATURES_DELAY-1];
     memmove(&lpcnet->old_gain[1], &lpcnet->old_gain[0], (FEATURES_DELAY-1)*sizeof(lpcnet->old_gain[0]));
@@ -164,13 +182,30 @@ void lpcnet_synthesize(LPCNetState *lpcnet, short *output, const float *features
     memcpy(lpc, lpcnet->old_lpc[FEATURES_DELAY-1], LPC_ORDER*sizeof(lpc[0]));
     memmove(lpcnet->old_lpc[1], lpcnet->old_lpc[0], (FEATURES_DELAY-1)*LPC_ORDER*sizeof(lpc[0]));
 
-    if (logmag) {
-        float tmp[NB_BANDS];
+    switch (mag) {
+    case 0:
+	lpc_from_cepstrum(lpcnet->old_lpc[0], features);
+	break;
+    case 1:
+    {
+	float tmp[NB_BANDS];
         for (i=0;i<NB_BANDS;i++) tmp[i] = pow(10.f, features[i]);
         lpc_from_bands(lpcnet->old_lpc[0], tmp);
     }
-    else
-	lpc_from_cepstrum(lpcnet->old_lpc[0], features);
+	break;
+    case 2:
+        for (i=0;i<LPC_ORDER;i++) {
+	    lpcnet->old_lpc[0][i] = features[i+NB_BANDS];
+	}
+	break;
+    default:
+	assert(0);
+    }
+
+    /* We optionally use this part of the feature vector to pass in LPCs,
+     * but we don't want any non-zero values here hitting the
+     * frame rate network.  TODO: better design */
+    RNN_CLEAR(&features[18], 18); 
 
     if (lpcnet->ftest) {
         float pitch_f = pitch;
@@ -220,7 +255,7 @@ void lpcnet_synthesize(LPCNetState *lpcnet, short *output, const float *features
         RNN_MOVE(&lpcnet->last_sig[1], &lpcnet->last_sig[0], LPC_ORDER-1);
         lpcnet->last_sig[0] = pcm;
         lpcnet->last_exc = exc;
-        pcm += PREEMPH*lpcnet->deemph_mem;
+        pcm += lpcnet->preemph*lpcnet->deemph_mem;
         lpcnet->deemph_mem = pcm;
         if (pcm<-32767) pcm = -32767;
         if (pcm>32767) pcm = 32767;
diff --git a/src/lpcnet.h b/src/lpcnet.h
index 70e849e..bd98a37 100644
--- a/src/lpcnet.h
+++ b/src/lpcnet.h
@@ -34,8 +34,10 @@
 typedef struct LPCNetState LPCNetState;
 LPCNetState *lpcnet_create();
 void lpcnet_destroy(LPCNetState *lpcnet);
-void lpcnet_synthesize(LPCNetState *lpcnet, short *output, const float *features, int N, int logmag);
+void lpcnet_synthesize(LPCNetState *lpcnet, short *output, float *features, int N, int mag); /* mag: 0 LPCs from cepstrum, 1 from log-magnitude bands, 2 LPCs passed in features[]; NOTE: features[18..35] are zeroed by the call */
 
 void lpcnet_open_test_file(LPCNetState *lpcnet, char file_name[]);
+void lpcnet_set_preemph(LPCNetState *lpcnet, float preemph);
+void lpcnet_set_pitch_embedding(LPCNetState *lpcnet, int val);
 
 #endif
diff --git a/src/lpcnet.py b/src/lpcnet.py
index 010f478..960e8c8 100644
--- a/src/lpcnet.py
+++ b/src/lpcnet.py
@@ -36,7 +36,6 @@ import numpy as np
 import h5py
 import sys
 
-frame_size = 160
 pcm_bits = 8
 embed_size = 128
 pcm_levels = 2**pcm_bits
@@ -113,7 +112,7 @@ class PCMInit(Initializer):
             'seed': self.seed
         }
 
-def new_lpcnet_model(rnn_units1=384, rnn_units2=16, nb_used_features = 38, training=False, use_gpu=True):
+def new_lpcnet_model(frame_size = 160, rnn_units1=384, rnn_units2=16, nb_used_features = 38, training=False, use_gpu=True):
     pcm = Input(shape=(None, 3))
     feat = Input(shape=(None, nb_used_features))
     pitch = Input(shape=(None, 1))
diff --git a/src/lpcnet_dump.c b/src/lpcnet_dump.c
index d8a8409..58f9c98 100644
--- a/src/lpcnet_dump.c
+++ b/src/lpcnet_dump.c
@@ -87,7 +87,8 @@ static DenoiseState *rnnoise_create() {
 }
 
 static void rnnoise_destroy(DenoiseState *st) {
-  free(st);
+    freq_close();
+    free(st);
 }
 
 static short float2short(float x)
diff --git a/src/lpcnet_freedv.c b/src/lpcnet_freedv.c
index 823fcdc..fe154ea 100644
--- a/src/lpcnet_freedv.c
+++ b/src/lpcnet_freedv.c
@@ -80,3 +80,9 @@ void lpcnet_dec(LPCNetFreeDV *lf, char *frame, short* pcm)
 
 int lpcnet_samples_per_frame(LPCNetFreeDV *lf) { return FRAME_SIZE*lf->q->dec; } 
 int lpcnet_bits_per_frame(LPCNetFreeDV *lf) { return lf->q->bits_per_frame; } 
+
+static char git_hash[] = GIT_HASH;
+char *lpcnet_get_hash(void) {
+    return git_hash;
+}
+
diff --git a/src/lpcnet_freedv.h b/src/lpcnet_freedv.h
index 43c8298..874f7cc 100644
--- a/src/lpcnet_freedv.h
+++ b/src/lpcnet_freedv.h
@@ -8,6 +8,10 @@
 #ifndef __LPCNET_FREEDV__
 #define __LPCNET_FREEDV__
 
+#ifdef __cplusplus
+  extern "C" {
+#endif
+
 typedef struct LPCNetFreeDV LPCNetFreeDV;
 
 LPCNetFreeDV* lpcnet_freedv_create(int direct_split);
@@ -16,5 +20,10 @@ void lpcnet_enc(LPCNetFreeDV *lf, short *pcm, char *frame);
 void lpcnet_dec(LPCNetFreeDV *lf, char *frame, short* pcm);
 int lpcnet_bits_per_frame(LPCNetFreeDV *lf);
 int lpcnet_samples_per_frame(LPCNetFreeDV *lf);
+char *lpcnet_get_hash(void);
+
+#ifdef __cplusplus
+}
+#endif
 
 #endif
diff --git a/src/nnet.c b/src/nnet.c
index 8ad4a26..1da7d70 100644
--- a/src/nnet.c
+++ b/src/nnet.c
@@ -43,7 +43,9 @@
 
 #ifdef __AVX__
 #include "vec_avx.h"
-#elif __ARM_NEON__
+#elif __SSE__
+#include "vec_sse.h"
+#elif __ARM_NEON__ || __aarch64__
 #include "vec_neon.h"
 #else
 #warning Compiling without any vectorization. This code will be very slow
diff --git a/src/plot_lpc.m b/src/plot_lpc.m
new file mode 100644
index 0000000..3b814be
--- /dev/null
+++ b/src/plot_lpc.m
@@ -0,0 +1,50 @@
+% plot_lpc.m
+% David Rowe April 2020
+%
+% Visualise LPC spectra for 700C decoder experiments
+
+Fs  = 8000;       % speech sample rate
+Fsf = 100;        % frame sample rate
+nb_features = 55;
+nb_rateK = 18;    % number of rateK (log amplitude) features
+nb_lpc = 10;      % number of LPCs
+
+function plot_against_time(v, st_sec, en_sec, Fs, leg='b')
+  st = Fs*st_sec; en = Fs*en_sec;
+  t = st_sec:1/Fs:en_sec;
+  plot(t,v(st+1:en+1),leg);
+endfunction
+
+function mesh_against_time(m, st_sec, en_sec, Fs)
+  st = Fs*st_sec; en = Fs*en_sec;
+  t = st_sec:1/Fs:en_sec;
+  mesh(m(st+1:en+1,:));  
+endfunction
+
+function mesh_aks_against_time(aks, st_sec, en_sec, Fs)
+  % Mesh plot (dB) of the 64 point response of 1/A(z), one column per row of aks between st_sec and en_sec (Fs = rows per second)
+  st = Fs*st_sec; en = Fs*en_sec;
+  aks = aks(st+1:en+1,:); A = [];
+  for f=1:size(aks,1)   % loop over rows; length(aks) would give the column count when fewer rows than columns are selected
+    A = [A freqz(1,[1 aks(f,:)],64)];
+  end
+  AdB = 20*log10(abs(A));
+  max(AdB(:))
+  mesh(AdB);  
+endfunction
+
+# plots of speech (input), rateK vectors, LPC spectra
+
+features=load_f32("../build_linux/all_8k.f32", nb_features);
+rateK=features(:, 1:nb_rateK);
+aks = features(:, nb_rateK+1:nb_rateK+nb_lpc);
+fs=fopen("../build_linux/all_8k_10ms.sw","rb");
+s = fread(fs,Inf,"short");
+fclose(fs);
+
+st_sec=14; en_sec=16;
+
+figure(1); clf; plot_against_time(s, st_sec, en_sec, Fs, 'b')
+figure(2); clf; mesh_against_time(rateK, st_sec, en_sec, Fsf);
+figure(3); clf; mesh_aks_against_time(aks, st_sec, en_sec, Fsf);
+
diff --git a/src/plot_pulaw.py b/src/plot_pulaw.py
new file mode 100755
index 0000000..10d5656
--- /dev/null
+++ b/src/plot_pulaw.py
@@ -0,0 +1,52 @@
+#!/usr/bin/python3
+# Utility to inspect packed ulaw samples from sw2packedulaw.c (or dump_data.c) before training 
+
+import numpy as np
+import matplotlib.pyplot as plt
+import sys
+import ulaw
+import argparse
+
+parser = argparse.ArgumentParser(description='Plot LPCNet training packed ulaw samples')
+parser.add_argument('file1', help='pulaw file of packed ulaw samples')
+parser.add_argument('--file2', help='optional second packed ulaw file to compare')
+parser.add_argument('--nb_samples', type=int, default=-1, help='Optional number of samples to plot')
+args = parser.parse_args()
+
+data = np.fromfile(args.file1, dtype='uint8')
+nb_samples = args.nb_samples
+data = data[:nb_samples]
+
+sig = np.array(data[0::4], dtype='float')
+pred = np.array(data[1::4], dtype='float')
+in_exc = np.array(data[2::4], dtype='float')
+out_exc = np.array(data[3::4], dtype='float')
+   
+print("exc var: %4.3e" % (np.var(ulaw.ulaw2lin(in_exc))))
+
+plt.figure(1)
+plt.subplot(211)
+plt.plot(ulaw.ulaw2lin(sig), label='sig')
+plt.ylim((-30000,30000))
+plt.legend()
+plt.subplot(212)
+plt.plot(ulaw.ulaw2lin(pred), label='pred')
+plt.ylim((-30000,30000))
+plt.legend()
+plt.show(block=False)
+
+plt.figure(2)
+plt.subplot(211)
+plt.plot(ulaw.ulaw2lin(in_exc), label='in_exc')
+if args.file2:
+    data2 = np.fromfile(args.file2, dtype='uint8')
+    data2 = data2[:nb_samples]
+    in_exc2 = np.array(data2[2::4], dtype='float')
+    plt.plot(ulaw.ulaw2lin(in_exc2), label='in_exc2')
+plt.ylim((-30000,30000))
+plt.legend()
+plt.subplot(212)
+plt.plot(ulaw.ulaw2lin(out_exc), label='out_exc')
+plt.ylim((-30000,30000))
+plt.legend()
+plt.show()
diff --git a/src/plot_train.py b/src/plot_train.py
index 910d7e9..7e2bc7b 100644
--- a/src/plot_train.py
+++ b/src/plot_train.py
@@ -3,11 +3,10 @@ import numpy as np
 import sys
 
 loss = np.loadtxt(sys.argv[1])
-delta_loss = (loss[1:,0]-loss[:-1,0])/loss[1:,0]
+delta_loss = (loss[1:]-loss[:-1])/loss[1:]
 
 plt.figure(1)
-plt.plot(loss[:,0],'r')
-plt.plot(loss[:,1],'g')
+plt.plot(loss[:],'r')
 plt.title('loss')
 plt.show(block=False)
 plt.figure(2)
diff --git a/src/plot_train.sh b/src/plot_train.sh
index 2a1fddf..3c86094 100755
--- a/src/plot_train.sh
+++ b/src/plot_train.sh
@@ -6,5 +6,5 @@
 # plot graphs of loss and spares categorical accuracy to get a feel
 # for progress while training
 
-grep loss $1 | sed -n 's/.*===\].*loss: \(.*\) - val_loss: \(.*\)/\1 \2/p' > loss.txt
-python3 plot_train.py loss.txt
+grep loss $1 | sed -n 's/.*===\].*step - loss: \(.*\)/\1/p' > loss.txt
+python3 ~/LPCNet/src/plot_train.py loss.txt
diff --git a/process.sh b/src/process.sh
similarity index 100%
rename from process.sh
rename to src/process.sh
diff --git a/src/sw2packedulaw.c b/src/sw2packedulaw.c
new file mode 100644
index 0000000..7724158
--- /dev/null
+++ b/src/sw2packedulaw.c
@@ -0,0 +1,188 @@
+/*
+  sw2packedulaw.c
+
+  Convert signed word samples to packed ulaw samples to drive LPCNet
+  training, this code is a cut/paste from dump_data.c with a few other
+  options.
+
+  By varying the LPC predictor coefficients we can try no predictor,
+  first order, and regular LPC.
+
+  1. No prediction (WaveRNN I guess):
+    $ ~/codec2/build_linux/src/c2sim ~/Downloads/all_8k.sw --ten_ms_centre all_8k_10ms.sw --rateKWov all_8k.f32 
+    $ ./src/sw2packedulaw --frame_size 80 all_8k_10ms.sw all_8k.f32 all_8k_none.pulaw
+    $ ../src/plot_pulaw.py all_8k_none.pulaw
+
+  2. First order predictor:
+    $ ~/codec2/build_linux/src/c2sim ~/Downloads/all_8k.sw --ten_ms_centre all_8k_10ms.sw --rateKWov all_8k.f32 --first
+    $ ./src/sw2packedulaw --frame_size 80 all_8k_10ms.sw all_8k.f32 all_8k_first.pulaw
+
+  3. LPC with ulaw Q in the loop and noise injection (standard LPCNet design):
+    $ ~/codec2/build_linux/src/c2sim ~/Downloads/all_8k.sw --ten_ms_centre all_8k_10ms.sw --rateKWov all_8k.f32 --lpc 10
+    $ ./src/sw2packedulaw --frame_size 80 all_8k_10ms.sw all_8k.f32 all_8k.pulaw
+
+  4. LPC with no Q in the loop or noise injection (linear):
+    $ ./src/sw2packedulaw --frame_size 80 --linear all_8k_10ms.sw all_8k.f32 all_8k_linear.pulaw
+
+  See plot_pulaw.py to inspect output .pulaw files
+*/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "common.h"
+#include <math.h>
+#include "freq.h"
+#include "pitch.h"
+#include "arch.h"
+#include "celt_lpc.h"
+#include <assert.h>
+#include <getopt.h>
+
+#define NB_FEATURES 55
+#define CODEC2_LPC_ORDER 10
+
+typedef struct {
+  float lpc[LPC_ORDER];
+  float sig_mem[LPC_ORDER];
+  int exc_mem;
+} DenoiseState;
+/* Writes frame_size samples to file as 4 bytes each (signal, prediction, excitation in, excitation out), using st->lpc as the predictor and adding noise scaled by noise_std to the excitation fed back into st. */
+void write_audio(DenoiseState *st, const short *pcm, float noise_std, FILE *file, int frame_size) {
+  int i;
+  unsigned char data[4*frame_size];
+  for (i=0;i<frame_size;i++) {
+    int noise;
+    float p=0;
+    float e;
+    int j;
+    for (j=0;j<LPC_ORDER;j++) p -= st->lpc[j]*st->sig_mem[j];
+    e = lin2ulaw(pcm[i] - p);
+    /* Signal. */
+    data[4*i] = lin2ulaw(st->sig_mem[0]);
+    /* Prediction. */
+    data[4*i+1] = lin2ulaw(p);
+    /* Excitation in. */
+    data[4*i+2] = st->exc_mem;
+    /* Excitation out. */
+    data[4*i+3] = e;
+    /* Simulate error on excitation. */
+    noise = (int)floor(.5 + noise_std*.707*(log_approx((float)rand()/RAND_MAX)-log_approx((float)rand()/RAND_MAX)));
+    e += noise;
+    e = IMIN(255, IMAX(0, e));
+    /* Predictor memory is updated from the noisy, clamped excitation, not from pcm[i]. */
+    RNN_MOVE(&st->sig_mem[1], &st->sig_mem[0], LPC_ORDER-1);
+    st->sig_mem[0] = p + ulaw2lin(e);
+    st->exc_mem = e;
+  }
+  fwrite(data, 4*frame_size, 1, file);
+}
+
+/* Same 4-bytes-per-sample output as write_audio(), but takes ulaw out of the predictor path and injects no noise: the prediction error is computed on linear samples and the predictor memory holds pcm[i] itself. */
+void write_audio_linear(DenoiseState *st, const short *pcm, FILE *file, int frame_size) {
+  int i;
+  unsigned char data[4*frame_size];
+  for (i=0;i<frame_size;i++) {
+    float p=0;
+    float e;
+    int j;
+    for (j=0;j<LPC_ORDER;j++) p -= st->lpc[j]*st->sig_mem[j];
+    e = pcm[i] - p;
+    //fprintf(stderr,"pcm: %d p: %f e: %f\n", pcm[i], p, e);
+    /* Signal. */
+    data[4*i] = lin2ulaw(st->sig_mem[0]);
+    /* Prediction. */
+    data[4*i+1] = lin2ulaw(p);
+    /* Excitation in. */
+    data[4*i+2] = st->exc_mem;
+    /* Excitation out. */
+    data[4*i+3] = lin2ulaw(e);
+    /* Shift the signal history and store the unquantised input sample. */
+    RNN_MOVE(&st->sig_mem[1], &st->sig_mem[0], LPC_ORDER-1);
+    st->sig_mem[0] = pcm[i];
+    st->exc_mem = lin2ulaw(e);
+  }
+  fwrite(data, 4*frame_size, 1, file);
+}
+
+int main(int argc, char *argv[]) {
+    /* Reads 16 bit samples plus one feature vector per frame and writes packed ulaw (4 bytes per sample). */
+    int linear = 0;              /* --linear: use write_audio_linear() instead of write_audio() */
+    int frame_size = FRAME_SIZE; /* samples consumed per feature vector, see --frame_size */
+    DenoiseState st;
+    memset(&st, 0, sizeof(DenoiseState));
+    st.exc_mem = 128;
+    
+    int o = 0;
+    int opt_idx = 0;
+    while( o != -1 ) {
+        static struct option long_opts[] = {
+            {"linear", no_argument, 0, 'l'},
+            {"frame_size", required_argument, 0, 'f'},
+            {0, 0, 0, 0}
+        };
+	o = getopt_long(argc,argv,"l",long_opts,&opt_idx);
+	switch(o){
+	case 'f':
+	    frame_size = atoi(optarg);
+	    if (frame_size <= 0) {
+		/* frame_size dimensions the variable length arrays used below, so it must be positive */
+		fprintf(stderr, "frame_size must be a positive integer: %s\n", optarg);
+		exit(1);
+	    }
+	    fprintf(stderr, "frame_size: %d\n", frame_size);
+	    break;
+	case 'l':
+	    linear = 1;
+	    break;
+	case '?':
+	    goto helpmsg;
+	}
+    }
+    int dx = optind;
+
+    if ((argc - dx) < 3) {
+    helpmsg:
+        fprintf(stderr, "usage: sw2packedulaw [--linear] [--frame_size samples] Input.s16 FeatureFile.f32 Output.pulaw\n");
+        return 0;
+    }
+
+    FILE *fsw = fopen(argv[dx], "rb");
+    if (fsw == NULL) {
+	fprintf(stderr, "Can't open %s\n", argv[dx]);
+	exit(1);
+    }
+    FILE *ffeature = fopen(argv[dx+1], "rb");
+    if (ffeature == NULL) {
+	fprintf(stderr, "Can't open %s\n", argv[dx+1]);
+	exit(1);
+    }
+    FILE *fpackedpcm = fopen(argv[dx+2], "wb");
+    if (fpackedpcm == NULL) {
+	fprintf(stderr, "Can't open %s\n", argv[dx+2]);
+	exit(1);
+    }
+    
+    short frame[frame_size];
+    while (fread(frame, sizeof(short), frame_size, fsw) == (unsigned)frame_size) {
+	float features[NB_FEATURES];
+	int ret = fread(features, sizeof(float), NB_FEATURES, ffeature);
+	if (ret != NB_FEATURES) {
+	    fprintf(stderr, "feature file ended early!\n");
+	    exit(1);
+	}
+	for(int i=0; i<CODEC2_LPC_ORDER; i++) {
+	    st.lpc[i] = features[18+i]; /* LPCs are read from offset 18 of each feature vector */
+	}
+	if (linear)
+	    write_audio_linear(&st, frame, fpackedpcm, frame_size);
+	else {
+	    write_audio(&st, frame, 0.5, fpackedpcm, frame_size);
+	}
+    }
+
+    fclose(fsw);
+    fclose(ffeature);
+    fclose(fpackedpcm);
+    return 0;
+}
+
diff --git a/src/test_lpcnet.c b/src/test_lpcnet.c
index 0a34729..e8c9907 100644
--- a/src/test_lpcnet.c
+++ b/src/test_lpcnet.c
@@ -36,26 +36,37 @@
 int main(int argc, char **argv) {
     FILE *fin, *fout;
     LPCNetState *net;
-    int logmag = 0;
-
+    int mag = 0;
+    int frame_size = FRAME_SIZE;
+    
     net = lpcnet_create();
     
     int o = 0;
     int opt_idx = 0;
     while( o != -1 ) {
         static struct option long_opts[] = {
-            {"mag", no_argument, 0, 'i'},
-            {"nnet", required_argument, 0, 'n'},
+            {"frame_size", required_argument, 0, 'f'},
             {"logstates", required_argument, 0, 'l'},
-            {0, 0, 0, 0}
+            {"mag", required_argument, 0, 'i'},
+            {"nnet", required_argument, 0, 'n'},
+            {"no_pitch_embedding", no_argument, 0, 'e'},
+            {"pre", required_argument, 0, 'p'},
+           {0, 0, 0, 0}
         };
         
 	o = getopt_long(argc,argv,"ihn:l:",long_opts,&opt_idx);
         
 	switch(o){
+	case 'e':
+	    lpcnet_set_pitch_embedding(net, 0);
+	    break;
+	case 'f':
+	    frame_size = atoi(optarg);
+	    fprintf(stderr, "frame_size: %d\n", frame_size);
+	    break;
 	case 'i':
-	    logmag = 1;
-	    fprintf(stderr, "logmag: %d\n", logmag);
+	    mag = atoi(optarg);
+	    fprintf(stderr, "mag: %d\n", mag);
 	    break;
 	case 'l':
 	    fprintf(stderr, "logstates file: %s\n", optarg);
@@ -65,6 +76,10 @@ int main(int argc, char **argv) {
 	    fprintf(stderr, "loading nnet: %s\n", optarg);
 	    nnet_read(optarg);
 	    break;
+	case 'p':
+	    if (atoi(optarg) == 0)
+		lpcnet_set_preemph(net, 0.0);
+	    break;
 	case '?':
 	    goto helpmsg;
 	    break;
@@ -74,7 +89,9 @@ int main(int argc, char **argv) {
 
     if ((argc - dx) < 2) {
     helpmsg:
-        fprintf(stderr, "usage: test_lpcnet [--mag] [--logstates statesfile] [--nnet lpcnet_xxx.f32] <features.f32> <output.pcm>\n");
+        fprintf(stderr, "usage: test_lpcnet [--mag 0|1|2] [--logstates statesfile] [--nnet lpcnet_xxx.f32]"
+		" [--frame_size samples] [--no_pitch_embedding] [--pre 0|1] <features.f32> <output.s16>\n"); /* names must match long_opts[] */
+	fprintf(stderr, "--mag -i 0-cepstrals, 1-logmag, 2-disable LPC (WaveRNN)\n");
         return 0;
     }
 
@@ -99,13 +116,12 @@ int main(int argc, char **argv) {
     while (1) {
         float in_features[NB_TOTAL_FEATURES];
         float features[NB_FEATURES];
-        short pcm[FRAME_SIZE];
+        short pcm[frame_size];
         int nread = fread(in_features, sizeof(features[0]), NB_TOTAL_FEATURES, fin);
         if (nread != NB_TOTAL_FEATURES) break;
         RNN_COPY(features, in_features, NB_FEATURES);
-        RNN_CLEAR(&features[18], 18);
-        lpcnet_synthesize(net, pcm, features, FRAME_SIZE, logmag);
-        fwrite(pcm, sizeof(pcm[0]), FRAME_SIZE, fout);
+        lpcnet_synthesize(net, pcm, features, frame_size, mag);
+        fwrite(pcm, sizeof(pcm[0]), frame_size, fout);
         if (fout == stdout) fflush(stdout);
     }
     fclose(fin);
diff --git a/src/test_vec.c b/src/test_vec.c
index 09b51e7..efa617e 100644
--- a/src/test_vec.c
+++ b/src/test_vec.c
@@ -26,7 +26,10 @@ const char simd[]="AVX2";
 #else
 const char simd[]="AVX";
 #endif
-#elif __ARM_NEON__
+#elif __SSE__
+#include "vec_sse.h"
+const char simd[]="SSE";
+#elif __ARM_NEON__ || __aarch64__
 #include "vec_neon.h"
 const char simd[]="NEON";
 #else
diff --git a/src/thash.c b/src/thash.c
new file mode 100644
index 0000000..5b60f2e
--- /dev/null
+++ b/src/thash.c
@@ -0,0 +1,19 @@
+/*---------------------------------------------------------------------------*\
+
+  FILE........: thash.c
+  AUTHOR......: David Rowe
+  DATE CREATED: July 2020
+
+  Simple test program for LPCNet API get hash function
+
+\*---------------------------------------------------------------------------*/
+
+#include <stdio.h>
+#include "lpcnet_freedv.h"
+
+int main(void) { 
+    printf("%s\n", lpcnet_get_hash());
+    return 0;
+}
+
+
diff --git a/train_direct.sh b/src/train_direct.sh
similarity index 100%
rename from train_direct.sh
rename to src/train_direct.sh
diff --git a/src/train_lpcnet.py b/src/train_lpcnet.py
index 62abbd7..94ab9a8 100755
--- a/src/train_lpcnet.py
+++ b/src/train_lpcnet.py
@@ -35,9 +35,14 @@ from keras.callbacks import ModelCheckpoint
 from ulaw import ulaw2lin, lin2ulaw
 import keras.backend as K
 import h5py
-
+import argparse
+import os
 import tensorflow as tf
 from keras.backend.tensorflow_backend import set_session
+import matplotlib.pyplot as plt
+
+# less verbose tensorflow ....
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
 config = tf.ConfigProto()
 
 # use this option to reserve GPU memory, e.g. for running more than
@@ -46,23 +51,38 @@ config = tf.ConfigProto()
 
 set_session(tf.Session(config=config))
 
-nb_epochs = 10
-
 # Try reducing batch_size if you run out of memory on your GPU
 batch_size = 32
+# width of feature records used for training
+nb_features = 55
+
+parser = argparse.ArgumentParser(description='LPCNet training')  # command line replaces the old positional sys.argv parsing
+parser.add_argument('feature_file', help='.f32 file of float features')
+parser.add_argument('packed_ulaw_file', help='file of 4 multiplexed ulaw samples per speech sample')
+parser.add_argument('prefix', help='.h5 file prefix to easily identify each experiment')
+parser.add_argument('--frame_size', type=int, default=160, help='frame size in samples')
+parser.add_argument('--epochs', type=int, default=20, help='Number of training epochs')
+parser.add_argument('--no_pitch_embedding', action='store_true', help='disable pitch embedding')
+parser.add_argument('--load_h5', help='.h5 weights file to load into the model before training, e.g. to resume a partially trained model')
+args = parser.parse_args()
 
-model, _, _ = lpcnet.new_lpcnet_model(training=True)
+nb_epochs = args.epochs
+
+model, _, _ = lpcnet.new_lpcnet_model(frame_size=args.frame_size, training=True)
 
 model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
 model.summary()
 
-feature_file = sys.argv[1]
-pcm_file = sys.argv[2]            # 16 bit unsigned short PCM samples
-prefix = sys.argv[3]              # prefix to put on .h5 files to easily name each experiment
+if args.load_h5:
+    print("loading: %s" % (args.load_h5))
+    model.load_weights(args.load_h5)
+
+feature_file = args.feature_file
+pcm_file = args.packed_ulaw_file           
+prefix = args.prefix              
 frame_size = model.frame_size
-nb_features = 55
 nb_used_features = model.nb_used_features
-feature_chunk_size = 15
+feature_chunk_size = 15 # time window for conv1d/receptive field
 pcm_chunk_size = frame_size*feature_chunk_size
 
 # u for unquantised, load 16 bit PCM samples and convert to mu-law
@@ -84,7 +104,17 @@ in_exc = np.reshape(data[2::4], (nb_frames, pcm_chunk_size, 1))
 out_exc = np.reshape(data[3::4], (nb_frames, pcm_chunk_size, 1))
 del data
 
-print("ulaw std = ", np.std(out_exc))
+"""
+# plot ulaw signals to sanity check
+testf=10
+print(sig.shape)
+#plt.plot(sig[testf,:],label="sig")
+#plt.plot(pred[testf,:],label="pred")
+plt.plot(in_exc[testf,:],label="in_exc")
+plt.plot(out_exc[testf,:],label="out_exc")
+plt.legend()
+plt.show()
+"""
 
 features = np.reshape(features, (nb_frames, feature_chunk_size, nb_features))
 features = features[:, :, :nb_used_features]
@@ -93,12 +123,34 @@ features = features[:, :, :nb_used_features]
 # nb_used_features=38, so 0...37, so lpc-gain not used
 features[:,:,18:36] = 0   # zero out 18..35, so pitch and pitch gain being fed in, lpc gain ignored
 
+"""
+# plot features to sanity check
+print(features.shape)
+testf=10
+plt.plot(features[testf,:,37:38])
+plt.show()
+"""
+
 fpad1 = np.concatenate([features[0:1, 0:2, :], features[:-1, -2:, :]], axis=0)
 fpad2 = np.concatenate([features[1:, :2, :], features[0:1, -2:, :]], axis=0)
 features = np.concatenate([fpad1, features, fpad2], axis=1)
 
-# pitch feature uses as well as cesptrals
+# pitch feature uses as well as cepstrals
 periods = (.1 + 50*features[:,:,36:37]+100).astype('int16')
+print(periods.shape)
+# sanity check training data against pitch embedding range (must run before the override below zeroes the periods)
+assert np.all(periods >= 40), "pitch embedding < 40"
+assert np.all(periods < 256), "pitch embedding > 255"
+if args.no_pitch_embedding:
+    print("no_pitch_embedding")
+    periods[:] = 0  # every frame gets the same period value, overriding the values checked above
+
+"""
+# plot pitch to sanity check
+print(features.shape, periods.shape)
+plt.plot(periods.reshape(-1)[:1000])
+plt.show()
+"""
 
 in_data = np.concatenate([sig, pred, in_exc], axis=-1)
 
@@ -108,9 +160,8 @@ del in_exc
 
 # dump models to disk as we go
 #checkpoint = ModelCheckpoint('lpcnet20h_384_10_G16_{epoch:02d}.h5')
-checkpoint = ModelCheckpoint(prefix + '_{epoch:02d}.h5')
+checkpoint = ModelCheckpoint(prefix + '_{epoch:d}.h5')
 
 # use this to reload a partially trained model
-#model.load_weights('lpcnet_190203_07.h5')
 model.compile(optimizer=Adam(0.001, amsgrad=True, decay=5e-5), loss='sparse_categorical_crossentropy')
-model.fit([in_data, features, periods], out_exc, batch_size=batch_size, epochs=nb_epochs, validation_split=0.1, callbacks=[checkpoint, lpcnet.Sparsify(2000, 40000, 400, (0.05, 0.05, 0.2))])
+model.fit([in_data, features, periods], out_exc, batch_size=batch_size, epochs=nb_epochs, callbacks=[checkpoint, lpcnet.Sparsify(2000, 40000, 400, (0.05, 0.05, 0.2))])
diff --git a/train_pred2.sh b/src/train_pred2.sh
similarity index 100%
rename from train_pred2.sh
rename to src/train_pred2.sh
diff --git a/src/vec_avx.h b/src/vec_avx.h
index 1e58f8d..520b5b2 100644
--- a/src/vec_avx.h
+++ b/src/vec_avx.h
@@ -79,7 +79,7 @@ static __m128 exp4_approx(__m128 X)
    Y = _mm_castsi128_ps(_mm_and_si128(mask, _mm_add_epi32(I, _mm_castps_si128(Y))));
    return Y;
 }
-static __m256 exp8_approx(__m256 X)
+static inline __m256 exp8_approx(__m256 X)
 {
    __m256 Y;
    __m128 Xhi, Xlo, Yhi, Ylo;
diff --git a/src/vec_sse.h b/src/vec_sse.h
new file mode 100644
index 0000000..82ddd42
--- /dev/null
+++ b/src/vec_sse.h
@@ -0,0 +1,211 @@
+/* Copyright (c) 2020 SASANO Takayoshi
+                 2018 David Rowe
+                 2018 Mozilla
+                 2008-2011 Octasic Inc.
+                 2012-2017 Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+/*
+  SSE implementation of vector operations; compile with -msse.
+  Ported from the Arm NEON implementation.
+*/
+
+#include <xmmintrin.h>
+
+#ifndef LPCNET_TEST
+static float celt_exp2(float x)
+{
+    int integer;
+    float frac;
+    union {
+	float f;
+	opus_uint32 i;
+    } res;
+    integer = floor(x);
+    if (integer < -50)
+	return 0;
+    frac = x-integer;
+    /* K0 = 1, K1 = log(2), K2 = 3-4*log(2), K3 = 3*log(2) - 2 */
+    res.f = 0.99992522f + frac * (0.69583354f
+				  + frac * (0.22606716f + 0.078024523f*frac));
+    res.i = (res.i + (integer<<23)) & 0x7fffffff;
+    return res.f;
+}
+#define celt_exp_sse(x) celt_exp2((x)*1.44269504f)
+
+static float tansig_approx(float x)
+{
+    int i;
+    float y, dy;
+    float sign=1;
+    /* Tests are reversed to catch NaNs */
+    if (!(x<8))
+        return 1;
+    if (!(x>-8))
+        return -1;
+#ifndef FIXED_POINT
+    /* Another check in case of -ffast-math */
+    if (celt_isnan(x))
+	return 0;
+#endif
+    if (x<0)
+    {
+	x=-x;
+	sign=-1;
+    }
+    i = (int)floor(.5f+25*x);
+    x -= .04f*i;
+    y = tansig_table[i];
+    dy = 1-y*y;
+    y = y + x*dy*(1 - y*x);
+    return sign*y;
+}
+
+static OPUS_INLINE float sigmoid_approx(float x)
+{
+    return .5f + .5f*tansig_approx(.5f*x);
+}
+
+static void softmax(float *y, const float *x, int N)
+{
+    int i;
+    for (i=0;i<N;i++)
+        y[i] = celt_exp_sse(x[i]);
+}
+
+static void vec_tanh(float *y, const float *x, int N)
+{
+    int i;
+    for (i=0;i<N;i++)
+    {
+        y[i] = tansig_approx(x[i]);
+    }
+}
+
+static void vec_sigmoid(float *y, const float *x, int N)
+{
+    int i;
+    for (i=0;i<N;i++)
+    {
+        y[i] = sigmoid_approx(x[i]);
+    }
+}
+#endif
+
+static void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
+{
+    int i, j;
+    for (i=0;i<rows;i+=16)
+    {
+	float * restrict y = &out[i];
+      
+	/* keep y[0..15] in registers for duration of inner loop */
+      
+	__m128 y0_3 = _mm_loadu_ps(&y[0]);
+	__m128 y4_7 = _mm_loadu_ps(&y[4]);
+	__m128 y8_11 = _mm_loadu_ps(&y[8]);
+	__m128 y12_15 = _mm_loadu_ps(&y[12]);
+      
+	for (j=0;j<cols;j++)
+	{
+	    const float * restrict w;
+	    __m128 wvec0_3, wvec4_7, wvec8_11, wvec12_15;
+	    __m128 xj = _mm_set1_ps(x[j]);
+
+	    w = &weights[j*col_stride + i];
+
+	    wvec0_3 = _mm_loadu_ps(&w[0]);
+	    wvec4_7 = _mm_loadu_ps(&w[4]);
+	    wvec8_11 = _mm_loadu_ps(&w[8]);
+	    wvec12_15 = _mm_loadu_ps(&w[12]);
+
+	    wvec0_3 = _mm_mul_ps(wvec0_3, xj);
+	    wvec4_7 = _mm_mul_ps(wvec4_7, xj);
+	    wvec8_11 = _mm_mul_ps(wvec8_11, xj);
+	    wvec12_15 = _mm_mul_ps(wvec12_15, xj);
+
+	    y0_3 = _mm_add_ps(y0_3, wvec0_3);
+	    y4_7 = _mm_add_ps(y4_7, wvec4_7);
+	    y8_11 = _mm_add_ps(y8_11, wvec8_11);
+	    y12_15 = _mm_add_ps(y12_15, wvec12_15);
+	}
+
+	/* save y[0..15] back to memory */
+      
+	_mm_storeu_ps(&y[0], y0_3);
+	_mm_storeu_ps(&y[4], y4_7);
+	_mm_storeu_ps(&y[8], y8_11);
+	_mm_storeu_ps(&y[12], y12_15);
+    }
+}
+
+static void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
+{
+    int i, j;
+    for (i=0;i<rows;i+=16)
+    {
+	int cols;
+	cols = *idx++;
+	float * restrict y = &out[i];
+
+	/* keep y[0..15] in registers for duration of inner loop */
+      
+	__m128 y0_3 = _mm_loadu_ps(&y[0]);
+	__m128 y4_7 = _mm_loadu_ps(&y[4]);
+	__m128 y8_11 = _mm_loadu_ps(&y[8]);
+	__m128 y12_15 = _mm_loadu_ps(&y[12]);
+      
+	for (j=0;j<cols;j++)
+	{
+	    __m128 wvec;
+	    __m128 xj = _mm_set1_ps(x[*idx++]);
+
+	    wvec = _mm_loadu_ps(&w[0]);
+	    wvec = _mm_mul_ps(wvec, xj);
+	    y0_3 = _mm_add_ps(y0_3, wvec);
+
+	    wvec = _mm_loadu_ps(&w[4]);
+	    wvec = _mm_mul_ps(wvec, xj);
+	    y4_7 = _mm_add_ps(y4_7, wvec);
+
+	    wvec = _mm_loadu_ps(&w[8]);
+	    wvec = _mm_mul_ps(wvec, xj);
+	    y8_11 = _mm_add_ps(y8_11, wvec);
+
+	    wvec = _mm_loadu_ps(&w[12]);
+	    wvec = _mm_mul_ps(wvec, xj);
+	    y12_15 = _mm_add_ps(y12_15, wvec);
+
+	    w += 16;
+	}
+
+	/* save y[0..15] back to memory */
+      
+	_mm_storeu_ps(&y[0], y0_3);
+	_mm_storeu_ps(&y[4], y4_7);
+	_mm_storeu_ps(&y[8], y8_11);
+	_mm_storeu_ps(&y[12], y12_15);
+    }
+}
diff --git a/train_pred1.sh b/train_pred1.sh
deleted file mode 100755
index 3694252..0000000
--- a/train_pred1.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/sh -x
-# train_pred2.sh
-# David Rowe Jan 2019
-# Train multi-stage VQ for LPCNet
-
-PATH=$PATH:/home/david/codec2-dev/build_linux/misc/
-
-if [ $# -lt 1 ]; then
-    echo "usage: ./train_pred1.sh [-w] VQprefix"
-    echo "       $ ./train_pred1.sh pred1_v1"
-    exit 1
-fi
-
-VQ_NAME=$1
-echo $VQ_NAME
-
-K=18
-STOP=1E-2
-
-echo "*********"
-echo "Pred 1"
-echo "*********"
-echo "weighting dctLy[0] ...."
-t=$(mktemp)
-extract all_speech_features.f32 $t 0 17 10 1.0 1
-cat $t | ./weight > $VQ_NAME'_s0.f32'
-vqtrain $VQ_NAME'_s0.f32' $K 2048 $VQ_NAME'_stage1.f32' -r $VQ_NAME'_s1.f32' -s $STOP 
-vqtrain $VQ_NAME'_s1.f32' $K 2048 $VQ_NAME'_stage2.f32' -r $VQ_NAME'_s2.f32' -s $STOP
-vqtrain $VQ_NAME'_s2.f32' $K 2048 $VQ_NAME'_stage3.f32' -r $VQ_NAME'_s3.f32' -s $STOP 
-vqtrain $VQ_NAME'_s3.f32' $K 2048 $VQ_NAME'_stage4.f32' -r $VQ_NAME'_s4.f32' -s $STOP 
-
diff --git a/unittest/test_core_nn.sh b/unittest/test_core_nn.sh
index 392c897..cd955c7 100755
--- a/unittest/test_core_nn.sh
+++ b/unittest/test_core_nn.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/bash -x
 # test_core_nn.sh
 #
 
@@ -60,7 +60,7 @@ if [ ! -z $SYNTH_mag ]; then
     ../build_linux/src/dump_data --mag --test --c2pitch ../wav/c01_01.wav c01_01.f32
     diff c01_01_mag.f32 c01_01.f32 || { echo "ERROR in synth .f32 output! Exiting..."; exit 1; }
     echo "mag .f32 OK"
-    ../build_linux/src/test_lpcnet --mag -n lpcnet_190804a.f32 c01_01.f32 c01_01_out.raw
+    ../build_linux/src/test_lpcnet --mag 1 -n lpcnet_190804a.f32 c01_01.f32 c01_01_out.raw
     diff c01_01_190804a_targ.raw c01_01_out.raw || { echo "ERROR in synth .raw output! Exiting..."; exit 1; }
     echo "mag .raw OK"
 fi