Predicting Citation Count with Machine Learning ¶

This notebook is a narrative exploration of efforts to predict the number of citations per year a paper will receive, based on data available at time of publication. The model inputs include some combination of the following:

  • the full word content of the abstract
  • metrics relating the word content of the abstract to other papers in the sample (Imel & Hafen in prep)
  • publication metadata:
    • what year the paper is published
    • how many references are in the paper
    • how long the paper is
    • how many authors the paper has
    • the journal the paper is published in

The raw code for this analysis can be found here.

Table of Contents¶

  • Data
  • Exploratory Data Analysis
  • Modeling
    • Preprocessing
    • Baseline Model
    • Simple Linear Regression
    • Random Forest
    • Gradient Boosting
    • K Nearest Neighbors
    • Density and Asymmetry
    • Metadata
    • Neural Net
    • Voting Regression
    • Final Validation
  • Takeaways

Data ¶

I use publication abstracts and metadata pulled from the NASA astrophysics data system via the official API. The analyzed publications are from a randomly chosen physics or astrophysics specialization.
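For context, pulling this kind of data with the official ads Python client looks roughly like the sketch below. This is illustrative only: the actual retrieval was done externally as part of the cc package workflow, and the query string and field list here are placeholders.

import ads  # official NASA ADS API client; requires an API token

# Placeholder query; the real sample is a randomly chosen specialization
query = ads.SearchQuery(
    q='abs:"circumgalactic medium"',
    fl=[ 'bibcode', 'title', 'abstract', 'year', 'author', 'pub', 'reference', 'citation_count' ],
    rows=2000,
)
papers = list( query )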

I externally preprocessed the abstract data with natural language processing (including tokenizing, stemming, and removing filler words), and each abstract has a corresponding bag-of-words representation.
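For reference, here is a minimal sketch of that kind of preprocessing using nltk; the actual pipeline lives in the cc package, and the specific tokenizer, stemmer, and stop-word list shown here are assumptions.

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

nltk.download( 'punkt' )
nltk.download( 'stopwords' )

def preprocess( abstract ):
    '''Tokenize an abstract, drop stop words and non-alphabetic tokens, and stem what remains.'''
    stemmer = SnowballStemmer( 'english' )
    stop_words = set( stopwords.words( 'english' ) )
    tokens = word_tokenize( abstract.lower() )
    return [ stemmer.stem( t ) for t in tokens if t.isalpha() and t not in stop_words ]

The bag-of-words representation is then simply a matrix of per-abstract counts of the resulting stems, analogous to the sparse count matrix used below.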

Back to Page Top

User-Defined Parameters¶

In [1]:
pm = dict(
    
    # Data selection
    data_dir = '/Users/zhafen/data/literature_topography',
    region_number = 8,
    convergence_degree = 3,
    kernel_size = 16,
    
    # Features
    numerical_variables = [
        'age',
        'references_count',
        'page_count',
        'log_author_count',
    ],
    categorical_variables = [
        'journal_filtered',
    ],
    semantic_variables = [
        'density',
        'asymmetry',
    ],
    
)

Load raw data¶

In [2]:
import numpy as np
import os
In [3]:
# My custom non-relational-data-management package
import verdict
# My library for NLP analysis of scientific abstracts
from cc import atlas, cartography, utils
In [4]:
# Load summary information.
# I analyzed several randomly chosen specializations ("regions"), of which we are choosing an arbitrary one.
summary_data_fp = os.path.join( pm['data_dir'], 'regions', 'regions_summary.h5' )
data = verdict.Dict.from_hdf5( summary_data_fp )
data_k = data['regions'][str(pm['region_number'])]
In [5]:
# Class for management of abstracts
atlas_dir = os.path.join( pm['data_dir'], 'regions', 'region_{}'.format( pm['region_number'] ) )
a = atlas.Atlas( atlas_dir, load_bibtex=False )
Loading saved atlas data.
100%|██████████| 37311/37311 [00:01<00:00, 20387.92it/s]
In [6]:
# Class for analysis of vectorized abstracts
projection = a.vectorize(
    verbose = True,
)
c = cartography.Cartographer( **projection )
Vectorizing text...
Using saved vectorized text...
In [7]:
# Retrieve metrics I calculated in external pre-processing
metrics_fp = os.path.join( atlas_dir, 'topography_metrics.h5' )
metrics = verdict.Dict.from_hdf5( metrics_fp )
In [8]:
# Not all the publications are viable for analysis.
# I've saved information about what publications are viable, and here we load the identifying information.
converged_kernel_size = data_k['converged_kernel_size'][:,-pm['convergence_degree']]
converged = converged_kernel_size >= pm['kernel_size']
publications = c.publications[converged]
inds = np.arange( c.publications.size )[converged]

Format into a DataFrame¶

In [9]:
import copy
import pandas as pd
import warnings
In [10]:
# Make into a dataframe
df_data = copy.deepcopy( metrics )
df_data['projection_ind'] = inds
df = pd.DataFrame(
    data = df_data._storage,
    index=publications,
)
In [11]:
# Rename one of the metrics to something more intuitive
df.rename( { 'fringe_factor': 'asymmetry' }, axis='columns', inplace=True )
In [12]:
# Drop publications with no citations.
# This also removes non-paper entries such as grant proposals.
df = df.loc[np.invert( np.isclose( df['citations_per_year'], 0. ) )]
In [13]:
# Add logscale versions for some variables
for column in [ 'density', 'citations_per_year', ]:
    df['log_{}'.format( column )] = np.log10( df[column] )
In [14]:
# Drop rows for which we're missing abstract data (missing abstracts show up as NaN in density)
df.dropna(subset=['density',], inplace=True)

Derive or retrieve additional quantities¶

In [15]:
import tqdm

ADS metadata¶

In [16]:
citation_keys = []
additional_data = {
    'references_count': [],
    'pages': [],
    'author': [],
    'journal': [],
    'title': [],
    'abstract_character_count': [],
    'entry_date': [],
}
for citation_key, p in tqdm.tqdm( a.data.items() ):
    
    citation_keys.append( citation_key )
    
    # number of references
    if p.references is None:
        additional_data['references_count'].append( 0 )
    else:
        additional_data['references_count'].append( len( p.references ) )
    
    # Citation info
    for key in [ 'pages', 'author', 'journal', 'title' ]:
        try:
            additional_data[key].append( p.citation[key] )
        except KeyError:
            additional_data[key].append( pd.NA )
            
    # Abstract
    additional_data['abstract_character_count'].append( len( p.abstract_str() ) )
    
    # Entry date
    additional_data['entry_date'].append( p.entry_date )
100%|██████████| 37311/37311 [00:02<00:00, 15558.42it/s]
In [17]:
# Convert to datetime
additional_data['entry_date'] = pd.to_datetime( additional_data['entry_date'] )
In [18]:
# Join it onto the existing dataframe
additional_df = pd.DataFrame( data=additional_data, index=citation_keys )
df = df.join( additional_df )

Index¶

In [19]:
df['ind'] = np.arange( df.index.size )

Page count¶

In [20]:
# Set up data structures
df['page_count'] = np.full( len( df ), np.nan )
In [21]:
# Strip the "L" or "P" prefix from page numbers (used e.g. for publications submitted to letters).
pages_str = df['pages'].str.replace( 'L|P', '', regex=True )
In [22]:
# Split into two to take the difference
pages_split = pages_str.str.split( '-', expand=True )
In [23]:
# Identify the parseable data
is_not_none = np.invert( pages_split[1].isnull() )
is_numeric = pages_split[1].str.isnumeric()
is_page_range = is_not_none & is_numeric
In [24]:
# For the valid page ranges, set the page count
df.loc[is_page_range,'page_count'] = (
    pages_split[1].loc[is_page_range].astype( int )
    - pages_split[0].loc[is_page_range].astype( int )
)
In [25]:
# There can be one or two edge cases where there's a negative page count because of the formatting
df.loc[df['page_count']<0,'page_count'] = np.nan

Author count¶

In [26]:
df['author_count'] = df['author'].str.split( ' and ' ).apply( len )
df['log_author_count'] = df['author_count'].apply( np.log10 )

Title character count¶

In [27]:
df['title_character_count'] = df['title'].str.len()

Journals, filtered¶

In [28]:
# Find the most common journals
df_grouped = df.groupby( 'journal' )
journal_entry_count = df_grouped.size().sort_values( ascending=False )
most_common_journals = journal_entry_count.iloc[:5].index
In [29]:
most_common_journals
Out[29]:
Index(['\apj', '\mnras', '\aap', '\aj', '\apjl'], dtype='object', name='journal')
In [30]:
# Make a new column accordingly
df['journal_filtered'] = df['journal'].copy()
is_not_common_journal = np.invert( df['journal'].isin( most_common_journals ) )
df.loc[is_not_common_journal,'journal_filtered'] = 'other'
df.loc[df['journal'].isna(),'journal_filtered'] = 'other'

Word vectors¶

In [31]:
v = c.vectors[df['projection_ind']]

Summarize data¶

There are two main data containers: df, which contains all the metadata and derived quantities, and the sparse matrix v, which contains the word vectors.

In [32]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 7359 entries, 2004JMOp...51.1447V to 1972ApJ...175L..73S
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype              
---  ------                     --------------  -----              
 0   age                        7359 non-null   float64            
 1   citations_per_year         7359 non-null   float64            
 2   converged_kernel_size      7359 non-null   int32              
 3   density                    7359 non-null   float64            
 4   asymmetry                  7359 non-null   float64            
 5   kernel_constant_asymmetry  7359 non-null   float64            
 6   smoothing_length           7359 non-null   float64            
 7   projection_ind             7359 non-null   int64              
 8   log_density                7359 non-null   float64            
 9   log_citations_per_year     7359 non-null   float64            
 10  references_count           7359 non-null   int64              
 11  pages                      7334 non-null   object             
 12  author                     7359 non-null   object             
 13  journal                    7037 non-null   object             
 14  title                      7359 non-null   object             
 15  abstract_character_count   7359 non-null   int64              
 16  entry_date                 7359 non-null   datetime64[ns, UTC]
 17  ind                        7359 non-null   int64              
 18  page_count                 4134 non-null   float64            
 19  author_count               7359 non-null   int64              
 20  log_author_count           7359 non-null   float64            
 21  title_character_count      7359 non-null   int64              
 22  journal_filtered           7359 non-null   object             
dtypes: datetime64[ns, UTC](1), float64(10), int32(1), int64(6), object(5)
memory usage: 1.6+ MB
In [33]:
df.sample(5)
Out[33]:
age citations_per_year converged_kernel_size density asymmetry kernel_constant_asymmetry smoothing_length projection_ind log_density log_citations_per_year ... journal title abstract_character_count entry_date ind page_count author_count log_author_count title_character_count journal_filtered
2013ApJ...766...95L 10.160942 14.467185 86 13.139502 0.683904 10.942457 1.217702 8119 1.118579 1.160384 ... \apj {Orbital Phase Variations of the Eccentric Gia... 2135 2013-03-24 00:00:00+00:00 3625 NaN 22 1.342423 65 \apj
2001AJ....122...38T 21.863567 1.646576 53 13.554672 0.730760 11.692153 1.180405 995 1.132089 0.216582 ... \aj {Quantitative Morphology of the Intermediate-R... 1051 2001-07-13 00:00:00+00:00 441 16.0 4 0.602060 182 \aj
2016ApJ...826...87W 6.805203 13.519099 30 14.297511 0.690165 11.042633 1.119076 20206 1.155260 1.130948 ... \apj {The Soft State of Cygnus X-1 Observed with Nu... 1776 2016-07-31 00:00:00+00:00 6651 NaN 20 1.301030 94 \apj
2020A&A...633A..69H 3.334410 86.672399 63 14.019318 0.656591 10.505457 1.141282 23361 1.146727 1.937881 ... \aap {KiDS+VIKING-450: Cosmic shear tomography with... 2022 2020-01-19 00:00:00+00:00 7137 NaN 28 1.447158 73 \aap
2007A&A...461..931P 16.376592 2.198262 18 14.810286 0.644750 10.315996 1.080330 22584 1.170563 0.342079 ... \aap {XMM-Newton observation of the NLS1 galaxy Ark... 1948 2007-01-06 00:00:00+00:00 7030 11.0 5 0.698970 102 \aap

5 rows × 23 columns

In [34]:
v
Out[34]:
<7359x30286 sparse matrix of type '<class 'numpy.int64'>'
	with 670079 stored elements in Compressed Sparse Row format>

Split testing and training data¶

As is standard practice.

In [35]:
from sklearn.model_selection import train_test_split
In [36]:
# Split dataframe
df_train, df_test = train_test_split( df, test_size=0.2, random_state=42 )
In [37]:
# Split word vector input
v_train = v[df_train['ind']]
v_test = v[df_test['ind']]
In [38]:
# Split semantic input
M_train = df_train[pm['semantic_variables']].values
M_test = df_test[pm['semantic_variables']].values
In [39]:
# Split metadata input
X_train_df = df_train[pm['numerical_variables'] + pm['categorical_variables']]
X_test_df = df_test[pm['numerical_variables'] + pm['categorical_variables']]
In [40]:
# Split derived (metadata+semantic) input
D_train_df = df_train[pm['numerical_variables'] + pm['categorical_variables'] + pm['semantic_variables']]
D_test_df = df_test[pm['numerical_variables'] + pm['categorical_variables'] + pm['semantic_variables']]
In [41]:
# Split output
y_train = df_train['log_citations_per_year'].values
y_test = df_test['log_citations_per_year'].values

Exploratory Data Analysis ¶

I explored this dataset thoroughly in other analyses, so here I'll just summarize it visually.

Back to Page Top

In [42]:
import seaborn as sns

sns.set_style( 'whitegrid' )

Citations per year¶

In [43]:
sns.histplot(
    df_train,
    x = 'log_citations_per_year',
)
Out[43]:
<AxesSubplot: xlabel='log_citations_per_year', ylabel='Count'>

Numerical variables¶

In [44]:
pairplot = sns.pairplot(
    df_train,
    x_vars = pm['numerical_variables'],
    y_vars = [ 'log_citations_per_year',],
    kind = 'hist',
    # plot_kws = { 'line_kws': { 'color': 'k', }, },
)
pairplot.axes[0,1].set_xlim( 0, 210 )
pairplot.axes[0,2].set_xlim( 0, 40 )
Out[44]:
(0.0, 40.0)

Categorical variable¶

In [45]:
sns.violinplot(
    df_train,
    x = 'journal_filtered',
    y = 'log_citations_per_year',
)
Out[45]:
<AxesSubplot: xlabel='journal_filtered', ylabel='log_citations_per_year'>

Modeling ¶

Preprocessing ¶

In this section I set up pipelines to prepare the data for the ML models.

Back to Page Top

In [46]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, Normalizer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_validate, KFold
In [47]:
# Two things for our numerical variables:
# Imputation of missing values and scaling by mean and std
numerical_preprocessing = Pipeline(
    [
        ( 'impute', SimpleImputer( strategy='mean' ) ),
        ( 'scale', StandardScaler() ),
    ]
)
In [48]:
# Preprocessing for word vectors is just scaling
vector_preprocessing = Pipeline(
    [
        ( 'scale', Normalizer() ),
    ]
)
In [49]:
# There is a subset of the numerical variables that I refer to as "semantic".
# These variables contain metrics that measure the relationship of the words in a publication to words in other publications.
semantic_preprocessing = Pipeline(
    [
        ( 'scale', StandardScaler() ),
    ]
)
In [50]:
# Combine the numerical preprocessing with onehot encoding for the categorical variable
metadata_preprocessing = ColumnTransformer( [
        ( 'numerical', numerical_preprocessing, pm['numerical_variables'] ),
        ( 'onehot', OneHotEncoder(), pm['categorical_variables'] ),
] )
In [51]:
# For semantic and metadata combined
derived_preprocessing = ColumnTransformer( [
        ( 'numerical', numerical_preprocessing, pm['numerical_variables'] + pm['semantic_variables'] ),
        ( 'onehot', OneHotEncoder(), pm['categorical_variables'] ),
] )
In [52]:
# Set up a kfold object for cross validation
kfold = KFold(
    n_splits = 5,
    shuffle = True,
    random_state = 1532
)
In [53]:
# Object for storing cross-validation results
crossvals = {}

A Baseline Model ¶

I use the mean log citations per year as the baseline.

In [54]:
from sklearn.base import BaseEstimator
In [55]:
class Baseline( BaseEstimator ):
    '''The baseline model is just the mean. We put it into a class
    for full consistency with all future models.'''
    
    def fit( self, X, y ):
        
        self.estimate = y.mean()
        
        return self
        
    def predict( self, X ):
        
        return np.full( X.shape[0], self.estimate )

model = Baseline()
In [56]:
model
Out[56]:
Baseline()
In [57]:
# Perform and store cross validation
crossvals['baseline'] = verdict.Dict( cross_validate(
    estimator = model,
    X = X_train_df.values,
    y = y_train,
    cv = kfold,
    scoring = 'neg_root_mean_squared_error',
    return_estimator = True,
) )

Because the baseline prediction is just the mean, the RMSE for the individual folds should be similar to the standard deviation of the full training sample.
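This follows directly from the definitions: for the constant prediction $\hat y_i = \bar y$, $${\rm RMSE} = \sqrt{ \frac{1}{N} \sum_{i=1}^{N} \left( y_i - \bar y \right)^2 } = \sigma_y,$$ up to fold-to-fold variation in the mean.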

In [58]:
-crossvals['baseline']['test_score']
Out[58]:
array([0.64482796, 0.66680128, 0.66007965, 0.63059422, 0.63430133])
In [59]:
sample_mean = y_train.mean()
sample_std = y_train.std()
sample_std
Out[59]:
0.6472643986056894

A Naive Model: Simple Linear Regression ¶

For the first model I'll see if we can just use linear least squares regression with the word vectors as input. This is a pretty silly model: I'm limited to linear order because of the high dimensionality of the word vectors, and it is unlikely that a single line can describe the citation relationship.

Back to Page Top

In [60]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
In [61]:
model = Pipeline(
    [
        ( 'preprocessing', vector_preprocessing ),
        ( 'poly', PolynomialFeatures( degree=1 ) ),
        ( 'reg', LinearRegression( fit_intercept=False ) ),
    ]
)
In [62]:
model
Out[62]:
Pipeline(steps=[('preprocessing', Pipeline(steps=[('scale', Normalizer())])),
                ('poly', PolynomialFeatures(degree=1)),
                ('reg', LinearRegression(fit_intercept=False))])
In [63]:
# Perform and store cross validation
crossvals['linear_regression'] = verdict.Dict( cross_validate(
    estimator = model,
    X = v_train,
    y = y_train,
    cv = kfold,
    scoring = 'neg_root_mean_squared_error',
    return_estimator = True,
) )
In [64]:
import matplotlib
import matplotlib.pyplot as plt
In [65]:
def rmse_swarmplot( crossval, y_lim=(sample_std-0.15, sample_std+0.15), ax=None ):    
    '''I'll use this plot throughout for showing results.'''
    
    # Format data
    df_data = -1. * verdict.Dict( crossval ).inner_item( 'test_score' )
    crossval_df = pd.DataFrame( df_data._storage ).melt( var_name='model', value_name='rmse' )
    
    # Visualize
    if ax is None:
        fig = plt.figure( figsize=(len(df_data)*1.5, 3) )
        ax = plt.gca()
    
    # Plot itself
    sns.swarmplot(
        data = crossval_df,
        x = 'model',
        y = 'rmse',
        hue = 'model',
        ax = ax,
        legend = None,
    )
    
    # Mark the analytic baseline value for comparison
    ax.axhline(
        sample_std,
        color = '0.5',
        linestyle = '--',
        linewidth = 0.75,
    )

    ax.set_xlabel( 'model' )
    ax.set_ylabel( r'RMSE in $\log_{10}$( citations / year )' )
    
    ax.set_ylim( y_lim )
    
    return ax.get_figure()
In [66]:
fig = rmse_swarmplot( crossvals, y_lim=(sample_std-0.5, sample_std+0.5) )

No surprise, this model doesn't perform particularly well.

A Basic Phenomenological Model: Random Forest ¶

For a more sophisticated phenomenological description of the data, I will try a random forest.

Back to Page Top

In [67]:
from sklearn.ensemble import RandomForestRegressor
In [68]:
model = Pipeline(
    [
        ( 'preprocessing', vector_preprocessing ),
        ( 'reg', RandomForestRegressor( max_depth=3, n_estimators=200 ) ),
    ]
)
In [69]:
model
Out[69]:
Pipeline(steps=[('preprocessing', Pipeline(steps=[('scale', Normalizer())])),
                ('reg', RandomForestRegressor(max_depth=3, n_estimators=200))])
In [70]:
# Perform and store cross validation
crossvals['random_forest'] = verdict.Dict( cross_validate(
    estimator = model,
    X = v_train,
    y = y_train,
    cv = kfold,
    scoring = 'neg_root_mean_squared_error',
    return_estimator = True,
) )
In [71]:
fig = rmse_swarmplot( crossvals, y_lim=(sample_std-0.5, sample_std+0.5) )

Better performance than the baseline, at least!

A Better Phenomenological Model: Gradient Boosting ¶

Since the random forest model went okay, how about a gradient boosting model with decision trees as the base?

Back to Page Top

In [72]:
from sklearn.ensemble import GradientBoostingRegressor
In [73]:
model = Pipeline(
    [
        ( 'preprocessing', vector_preprocessing ),
        ( 'reg', GradientBoostingRegressor() ),
    ]
)
In [74]:
model
Out[74]:
Pipeline(steps=[('preprocessing', Pipeline(steps=[('scale', Normalizer())])),
                ('reg', GradientBoostingRegressor())])
In [75]:
# Perform and store cross validation
crossvals['grad_boost'] = verdict.Dict( cross_validate(
    estimator = model,
    X = v_train,
    y = y_train,
    cv = kfold,
    scoring = 'neg_root_mean_squared_error',
    return_estimator = True,
) )
In [76]:
# Let's go ahead and drop the linear regression model so it's not distracting us.
del crossvals['linear_regression']
In [77]:
fig = rmse_swarmplot( crossvals, y_lim=(sample_std-0.2, sample_std+0.2) )
In [78]:
print( 'We can predict the number of citations to within a factor of ~{:.2g} on average.'.format( 10.**-crossvals['grad_boost']['test_score'].mean() ) )
We can predict the number of citations to within a factor of ~3.7 on average.

There is still a significant amount of error in the prediction: a factor of a few in the number of citations per year. This is perhaps to be expected: it would be surprising if citation counts could be predicted from the words used alone, with no information about how those words relate to one another (I'm not even using an N-gram language model).
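For reference, converting an RMSE quoted in $\log_{10}$ space into a multiplicative factor is just exponentiation: $${\rm factor} = 10^{\rm RMSE} \approx 10^{0.57} \approx 3.7$$ for the gradient boosting model above.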

An Informed Model: K Nearest Neighbors ¶

So far I have been treating the word vectors as generic data. To further improve the modeling, and to add some explanatory power, I will make use of what the vectors actually represent: the language used in a scientific abstract.

Here's a simple hypothesis: the number of citations a paper receives correlates with the number of citations received by papers on similar topics (i.e. papers using similar words). Conveniently, this hypothesis maps onto a simple, well-defined model: K Nearest Neighbors.

Back to Page Top

In [79]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
In [80]:
model = Pipeline(
    [
        ( 'preprocessing', vector_preprocessing ),
        ( 'knn', KNeighborsRegressor( n_neighbors=32 ) ),
    ]
)
In [81]:
model
Out[81]:
Pipeline(steps=[('preprocessing', Pipeline(steps=[('scale', Normalizer())])),
                ('knn', KNeighborsRegressor(n_neighbors=32))])
In [82]:
# Perform a parameter search for the number of neighbors to use
param_grid = {
    'knn__n_neighbors': [ 4, 16, 32, 64, 128, 256, ],
}
search = GridSearchCV( model, param_grid, )
search.fit( v_train, df_train['log_citations_per_year'] )
model = search.best_estimator_
search.best_params_
Out[82]:
{'knn__n_neighbors': 16}
In [83]:
# Perform and store cross validation
crossvals['KNN'] = verdict.Dict( cross_validate(
    estimator = model,
    X = v_train,
    y = y_train,
    cv = kfold,
    scoring = 'neg_root_mean_squared_error',
    return_estimator = True,
) )
In [84]:
fig = rmse_swarmplot( crossvals )

KNN does comparably well to gradient boosting, but is much more interpretable!

Another Informed Model: Density and Asymmetry ¶

Related to K Nearest Neighbors, we can ask whether the local geometry of the word-vector space is related to the citations received. In particular, I will use two metrics, each computed from a publication's $K$ nearest neighbors, that I previously calculated and will present in an upcoming paper (Imel & Hafen in prep):

density, which tracks the number of similar papers, $${\rm density} = \frac{K}{{\rm distance\,to\,the\,farthest\,neighbor}}$$

and asymmetry (computed as what I originally called the fringe factor), which tracks whether a paper uses language in a new direction relative to existing similar papers, $${\rm asymmetry} = \left| \frac{1}{K} \sum_{i=1}^{K} \frac{\vec v - \vec v_i}{\left| \vec v - \vec v_i \right|} \right|,$$ where $\vec v$ is the paper's word vector and $\vec v_i$ are the vectors of its $K$ nearest neighbors. The asymmetry metric is inspired by force balance: if similar papers surround a publication evenly, the unit vectors cancel and the asymmetry is small.

These metrics are just two numbers per publication, and most of the meaningful modeling went into calculating them, so I'll fit a simple linear regression.
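To make the two metrics concrete, here is a minimal sketch of how they could be computed from dense word vectors. The actual values are pre-computed by the cc package (using only publications that pre-date each paper), so the details below, including the handling of normalization, are assumptions.

import numpy as np
from sklearn.neighbors import NearestNeighbors

def density_and_asymmetry( vectors, K=16 ):
    '''Sketch: per-publication density and asymmetry from the K nearest neighbors.
    vectors is a dense (n_publications, n_dimensions) array of word vectors.'''

    # Find each publication's K nearest neighbors (the first neighbor returned is the point itself)
    nn = NearestNeighbors( n_neighbors=K + 1 ).fit( vectors )
    dist, ind = nn.kneighbors( vectors )
    dist, ind = dist[:,1:], ind[:,1:]

    # Density: K divided by the distance to the farthest of the K neighbors
    density = K / dist[:,-1]

    # Asymmetry: magnitude of the mean unit vector pointing from the neighbors to the publication
    diff = vectors[:,None,:] - vectors[ind]
    unit = diff / np.linalg.norm( diff, axis=-1, keepdims=True )
    asymmetry = np.linalg.norm( unit.mean( axis=1 ), axis=-1 )

    return density, asymmetry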

Back to Page Top

In [85]:
sns.histplot(
    df_train,
    x = 'density',
)
Out[85]:
<AxesSubplot: xlabel='density', ylabel='Count'>
In [86]:
sns.histplot(
    df_train,
    x = 'asymmetry',
)
Out[86]:
<AxesSubplot: xlabel='asymmetry', ylabel='Count'>
In [87]:
semantic_model = Pipeline(
    [
        ( 'preprocessing', semantic_preprocessing ),
        ( 'poly', PolynomialFeatures( degree=1 ) ),
        ( 'reg', LinearRegression( fit_intercept=False ) ),
    ]
)
In [88]:
semantic_model
Out[88]:
Pipeline(steps=[('preprocessing',
                 Pipeline(steps=[('scale', StandardScaler())])),
                ('poly', PolynomialFeatures(degree=1)),
                ('reg', LinearRegression(fit_intercept=False))])
In [89]:
# Perform and store cross validation
crossvals['semantic_features'] = verdict.Dict( cross_validate(
    estimator = semantic_model,
    X = M_train,
    y = y_train,
    cv = kfold,
    scoring = 'neg_root_mean_squared_error',
    return_estimator = True,
) )
In [90]:
fig = rmse_swarmplot( crossvals )

Regressing on the explanatory variables of density and asymmetry does not produce the best model, but it nevertheless holds some predictive power.

A Model Utilizing Metadata ¶

Until now I have used only the word content of the abstract. But presumably the number of citations received depends on much more than the words employed. I have gathered a number of additional attributes for each publication, and I will regress on those as well.

Back to Page Top

In [91]:
X_train_df.sample( 5 )
Out[91]:
age references_count page_count log_author_count journal_filtered
2004MNRAS.355..591I 18.469477 28 9.0 0.602060 \mnras
2011arXiv1101.2362Z 12.355184 9 NaN 0.301030 other
2021FrP.....9..709P 0.315615 188 NaN 0.000000 other
2005ApJ...628..160C 17.822983 53 8.0 0.778151 \apj
2000MNRAS.318..938M 22.548411 24 4.0 0.301030 \mnras

I will regress with gradient boosting, but in exploratory modeling I discovered that most other reasonable ML models perform similarly.

In [92]:
metadata_model = Pipeline(
    [
        ( 'preprocessing', metadata_preprocessing ),
        ( 'reg', GradientBoostingRegressor() ),
    ]
)
In [93]:
metadata_model
Out[93]:
Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('numerical',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer()),
                                                                  ('scale',
                                                                   StandardScaler())]),
                                                  ['age', 'references_count',
                                                   'page_count',
                                                   'log_author_count']),
                                                 ('onehot', OneHotEncoder(),
                                                  ['journal_filtered'])])),
                ('reg', GradientBoostingRegressor())])
In [94]:
# Perform and store cross validation
crossvals['metadata'] = verdict.Dict( cross_validate(
    estimator = metadata_model,
    X = X_train_df,
    y = y_train,
    cv = kfold,
    scoring = 'neg_root_mean_squared_error',
    return_estimator = True,
) )
In [95]:
fig = rmse_swarmplot( crossvals )

The metadata contains a lot of information useful for predicting citations!

A Neural Net ¶

I will wrap up the individual modeling with a simple neural net regressed onto the word vectors.

Back to Page Top

In [96]:
import keras
2023-06-24 18:25:09.341785: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
In [97]:
from sklearn.metrics import mean_squared_error
In [98]:
# Simple multilayer model
class NeuralNet( BaseEstimator ):
    '''The estimator for the neural net.
    We need to wrap it as such because the output of the keras predict function
    is not what the voting regressor expects.
    Also, the voting regressor expects estimator._estimator_type = 'regressor'.
    '''
    
    def __init__( self ):

        # Build the model itself
        neural_net = keras.models.Sequential()
        neural_net.add( keras.layers.Input( shape=(v.shape[1],) ) )
        neural_net.add( keras.layers.Dense( 16, activation='relu', ) )
        neural_net.add( keras.layers.Dense( 16, activation='relu', ) )
        neural_net.add( keras.layers.Dense( 16, activation='relu', ) )
        neural_net.add( keras.layers.Dense( 1, ) )
        neural_net.compile( loss='mean_squared_error' )
        
        self.neural_net = neural_net
        self._estimator_type = 'regressor'
    
    def fit( self, X, y, *args, **kwargs ):
        
        self.neural_net.fit( X, y, *args, **kwargs )
        
        return self
    
    def predict( self, X, *args, **kwargs ):
        
        raw_result = self.neural_net.predict( X, *args, **kwargs )
        
        return raw_result.transpose()[0]
In [99]:
# Actually build the model
model = Pipeline(
    [
        ( 'preprocessing', vector_preprocessing ),
        ( 'reg', NeuralNet() ),
    ]
)
In [100]:
model
Out[100]:
Pipeline(steps=[('preprocessing', Pipeline(steps=[('scale', Normalizer())])),
                ('reg', NeuralNet())])
In [101]:
# Perform KFold cross validation
# We could use the exact same code as before, but this allows a little additional control.
crossvals['neural_net'] = {'test_score':[], 'estimator':[],}
for train_index, test_index in kfold.split(v_train):
        
    # Fit and predict
    model.fit( v_train[train_index].toarray(), y_train[train_index] )
    log_cpy_pred = model.predict( v_train[test_index].toarray() )
    
    # Compare prediction
    rmse = -np.sqrt( mean_squared_error( y_train[test_index], log_cpy_pred ) )
    crossvals['neural_net']['test_score'].append( rmse )
    crossvals['neural_net']['estimator'].append( model )
crossvals['neural_net']['test_score'] = np.array( crossvals['neural_net']['test_score'] )
148/148 [==============================] - 1s 4ms/step - loss: 0.3548
37/37 [==============================] - 0s 1ms/step
148/148 [==============================] - 1s 3ms/step - loss: 0.2664
37/37 [==============================] - 0s 1ms/step
148/148 [==============================] - 1s 4ms/step - loss: 0.2303
37/37 [==============================] - 0s 2ms/step
148/148 [==============================] - 1s 4ms/step - loss: 0.2141
37/37 [==============================] - 0s 1ms/step
148/148 [==============================] - 1s 4ms/step - loss: 0.1905
37/37 [==============================] - 0s 1ms/step
In [102]:
fig = rmse_swarmplot( crossvals, y_lim=(sample_std-0.3, sample_std+0.3 ) )

While arguably the biggest black box of the models considered, the simple neural net performs remarkably well at predicting citations from the words used, at least in cross validation on the training set.

Putting it All Together: Voting Regression ¶

I'll now try to combine the models to offset their weaknesses.

Metadata and semantic features will be combined into one dataset, and modeled with gradient boosting.

For word vectors I will use the best of the intuitive models (the K Nearest Neighbors model) and the best of all the word vector models (the neural net).
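For reference, scikit-learn's VotingRegressor fits each component estimator on the same input array and averages their predictions (with uniform weights by default). A minimal sketch of the prediction step, where fitted_models stands in for the three fitted pipelines constructed below:

import numpy as np

def voting_predict( fitted_models, X ):
    '''Average the predictions of already-fit component models,
    which is what VotingRegressor.predict does with uniform weights.'''
    return np.mean( [ m.predict( X ) for m in fitted_models ], axis=0 )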

Back to Page Top

In [103]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import VotingRegressor
In [104]:
# Combine word vectors, semantic variables, and metadata into a single feature array
D_train = derived_preprocessing.fit_transform( D_train_df )
n_sample = D_train.shape[0]
n_D = D_train.shape[1]
n_v = v_train.shape[1]
A_train = np.zeros( shape=(n_sample, n_D + n_v) )
A_train[:,:n_D] = D_train
A_train[:,n_D:] = vector_preprocessing.fit_transform( v_train ).toarray()
In [105]:
# Build preprocessing objects to select either only the word vectors or the other features
select_word_vectors = FunctionTransformer( lambda A: A[:,n_D:] )
select_derived_features = FunctionTransformer( lambda A: A[:,:n_D] )
In [106]:
# Gradient boost for all the derived features (metadata and semantic)
derived_features_grad_boosting = Pipeline(
    [
        ( 'select_derived_features', select_derived_features ),
        ( 'reg', GradientBoostingRegressor() ),
    ]
)
In [107]:
# The most-successful intuitive word vector model
vector_knn = Pipeline(
    [
        ( 'select_word_vectors', select_word_vectors ),
        ( 'reg', KNeighborsRegressor( n_neighbors=search.best_params_['knn__n_neighbors'] ) ),
    ]
)
In [108]:
# The neural net    
vector_neural_net = Pipeline(
    [
        ( 'select_word_vectors', select_word_vectors ),
        ( 'reg', NeuralNet() ),
    ]
) 
In [109]:
# Put it together!
model = VotingRegressor(
    estimators = [
        ( 'vector_knn', vector_knn ),
        ( 'derived_features_grad_boosting', derived_features_grad_boosting ),
        ( 'vector_neural_net', vector_neural_net ),
    ],
)
In [110]:
model
Out[110]:
VotingRegressor(estimators=[('vector_knn',
                             Pipeline(steps=[('select_word_vectors',
                                              FunctionTransformer(func=<function <lambda> at 0x7fabccffd6c0>)),
                                             ('reg',
                                              KNeighborsRegressor(n_neighbors=16))])),
                            ('derived_features_grad_boosting',
                             Pipeline(steps=[('select_derived_features',
                                              FunctionTransformer(func=<function <lambda> at 0x7fabccffd7e0>)),
                                             ('reg',
                                              GradientBoostingRegressor())])),
                            ('vector_neural_net',
                             Pipeline(steps=[('select_word_vectors',
                                              FunctionTransformer(func=<function <lambda> at 0x7fabccffd6c0>)),
                                             ('reg', NeuralNet())]))])
In [111]:
# Perform KFold cross validation
crossvals['voting'] = {'test_score':[], 'estimator':[],}
for train_index, test_index in kfold.split(A_train):
        
    # Fit and predict
    model.fit( A_train[train_index], y_train[train_index] )
    log_cpy_pred = model.predict( A_train[test_index] )
    
    # Compare prediction
    rmse = -np.sqrt( mean_squared_error( y_train[test_index], log_cpy_pred ) )
    crossvals['voting']['test_score'].append( rmse )
    crossvals['voting']['estimator'].append( model )
crossvals['voting']['test_score'] = np.array( crossvals['voting']['test_score'] )
148/148 [==============================] - 1s 4ms/step - loss: 0.3664
37/37 [==============================] - 0s 1ms/step
148/148 [==============================] - 1s 3ms/step - loss: 0.3680
37/37 [==============================] - 0s 1ms/step
148/148 [==============================] - 1s 3ms/step - loss: 0.3682
37/37 [==============================] - 0s 1ms/step
148/148 [==============================] - 1s 5ms/step - loss: 0.3951
37/37 [==============================] - 0s 1ms/step
148/148 [==============================] - 1s 4ms/step - loss: 0.3768
37/37 [==============================] - 0s 1ms/step
In [112]:
fig = rmse_swarmplot( crossvals, y_lim=(sample_std-0.3, sample_std+0.3 ) )

The performance of the voting model is comparable to that of the metadata model.

Final Validation ¶

Time to check how the models do on the validation dataset!

Back to Page Top

In [113]:
# Combine word vectors, semantic variables, and metadata into a single feature array.
# Reuse the preprocessing objects already fit on the training data (no refitting on the test set).
D_test = derived_preprocessing.transform( D_test_df )
n_sample = D_test.shape[0]
n_D = D_test.shape[1]
n_v = v_test.shape[1]
A_test = np.zeros( shape=(n_sample, n_D + n_v) )
A_test[:,:n_D] = D_test
A_test[:,n_D:] = vector_preprocessing.transform( v_test ).toarray()
In [114]:
# Fit each model to the full training set, then evaluate on the held-out test set.
ys = {
    'actual': y_test,
}
rmses = {}
for model_name, item in tqdm.tqdm( crossvals.items() ):
    
    # Retrieve the model
    model = item['estimator'][0]
    
    # Fit and predict
    if model_name in [ 'baseline', 'random_forest', 'grad_boost', 'KNN', 'neural_net' ]:
        model.fit( v_train, y_train )
        y_pred = model.predict( v_test )
    elif model_name == 'semantic_features':
        model.fit( M_train, y_train )
        y_pred = model.predict( M_test )
    elif model_name == 'metadata':
        model.fit( X_train_df, y_train )
        y_pred = model.predict( X_test_df )
    elif model_name == 'voting':
        model.fit( A_train, y_train )
        y_pred = model.predict( A_test )
    else:
        raise KeyError( 'Unknown model name, {}'.format( model_name ) )
        
    ys[model_name] = y_pred
    rmses[model_name] = np.sqrt( mean_squared_error( y_test, y_pred ) )
 75%|███████▌  | 6/8 [00:48<00:11,  5.68s/it]
184/184 [==============================] - 1s 2ms/step - loss: 0.1646
46/46 [==============================] - 0s 745us/step
 88%|████████▊ | 7/8 [00:49<00:04,  4.38s/it]
184/184 [==============================] - 1s 3ms/step - loss: 0.5087
46/46 [==============================] - 0s 1ms/step
100%|██████████| 8/8 [00:57<00:00,  7.13s/it]
In [115]:
# Calculate what fraction of predictions fall within a given number of citations per year of the true value.
# I will print this later.
cpy_df = pd.DataFrame( ys ).apply( lambda x: 10.**x )
cpy_diff = np.abs( cpy_df['actual'] - cpy_df['voting'] )
dist, bins = np.histogram(
    cpy_diff,
    bins = np.arange( 0, cpy_diff.max()+1, 1 ),
)
cdist = np.cumsum( dist )
pdist = cdist / cdist[-1] * 100
In [116]:
fig = plt.figure( figsize=(len(rmses)*1.5, 3) )
ax = plt.gca()

rmse_df = pd.DataFrame({
    'model_name': list( rmses.keys() ),
    'rmse': list( rmses.values() ),
})

sns.swarmplot(
    rmse_df,
    y = 'rmse',
    x = 'model_name',
    color = 'k',
    marker = 'X',
    s = 10,
    edgecolor = 'k',
    ax = ax,
    legend = None,
    zorder = 10,
)

fig = rmse_swarmplot( crossvals, ax=ax, y_lim=( sample_std-0.3, sample_std+0.3) )

Black "x"s indicate the RMSE for the validation sample.

The neural net does significantly worse for the validation sample, suggesting it is overfit. On the other hand, voting, which includes the neural net, does just as well as before.

Takeaways ¶

I predicted the number of citations a publication will receive per year, using as input:

  • the full word content of the abstract
  • metrics relating the word content of the abstract to other papers in the sample
  • publication metadata:
    • what year the paper is published
    • how many references are in the paper
    • how long the paper is
    • how many authors the paper has
    • the journal the paper is published in
In [117]:
print( (
    'The best model can predict the number of citations per year to within a factor of ~{:.2g},\n'.format( 10.**rmse_df['rmse'].min() ) +
    'compared to a factor of ~{:.2g} for the baseline.'.format( 10.**rmses['baseline'] )
) )
The best model can predict the number of citations per year to within a factor of ~3.4,
compared to a factor of ~4.4 for the baseline.
In [118]:
print( (
    'The best model estimates the citation count to within {:.2g} citations per year for {:.2g}% of publications.'.format( bins[3], pdist[3] )
) )
The best model estimates the citation count to within 3 citations per year for 76% of publications.

Back to Page Top

Cautions¶

  • The predicted citation count is not a measure of paper quality. In fact, quality may correlate with how much the predicted citation count disagrees with the actual value—receiving an exceptionally large (or small) number of citations given the number of authors, age, journal, topic, etc. may occur if the paper is especially valuable (or not).

  • Addressing the time dependence is necessary before using these models for serious applications. The citations a paper has received depend on time, e.g. a publication cannot be cited by papers that preceded it. This analysis does not take that into account, with the exception of density and asymmetry, which are calculated using only pre-existing publications.

  • These models are trained on log citations per year because the citation distribution is known to be log-normal. Relatedly, it is not surprising that the models estimate the citation count to within only a few citations per year, because most papers receive only a few citations per year.

  • Additional exploratory analysis and modeling was done separately, and is available upon request.

Credits¶

Python packages essential to this analysis include:

  • cc (my package for NLP of scientific abstracts)
  • scikit-learn
  • TensorFlow
  • pandas
  • ads
  • nltk
  • numpy
  • matplotlib
  • seaborn