| Wong Hoi Sing Edison | 7c106d7 | 2006-06-05 17:27:58 -0700 | [diff] [blame] | 1 | /* | 
|  | 2 | * TCP Low Priority (TCP-LP) | 
|  | 3 | * | 
|  | 4 | * TCP Low Priority is a distributed algorithm whose goal is to utilize only | 
|  | 5 | *   the excess network bandwidth as compared to the ``fair share`` of | 
| Wong Hoi Sing Edison | 3795da4 | 2006-09-13 20:30:30 -0700 | [diff] [blame] | 6 | *   bandwidth as targeted by TCP. | 
| Wong Hoi Sing Edison | 7c106d7 | 2006-06-05 17:27:58 -0700 | [diff] [blame] | 7 | * | 
| Wong Hoi Sing Edison | 7c106d7 | 2006-06-05 17:27:58 -0700 | [diff] [blame] | 8 | * As of 2.6.13, Linux supports pluggable congestion control algorithms. | 
|  | 9 | * Due to the limitation of the API, we take the following changes from | 
|  | 10 | * the original TCP-LP implementation: | 
|  | 11 | *   o We use newReno in most core CA handling. Only add some checking | 
|  | 12 | *     within cong_avoid. | 
|  | 13 | *   o Error correcting in remote HZ, therefore remote HZ will be keeped | 
|  | 14 | *     on checking and updating. | 
|  | 15 | *   o Handling calculation of One-Way-Delay (OWD) within rtt_sample, sicne | 
|  | 16 | *     OWD have a similar meaning as RTT. Also correct the buggy formular. | 
|  | 17 | *   o Handle reaction for Early Congestion Indication (ECI) within | 
|  | 18 | *     pkts_acked, as mentioned within pseudo code. | 
|  | 19 | *   o OWD is handled in relative format, where local time stamp will in | 
|  | 20 | *     tcp_time_stamp format. | 
|  | 21 | * | 
| Wong Hoi Sing Edison | 3795da4 | 2006-09-13 20:30:30 -0700 | [diff] [blame] | 22 | * Original Author: | 
|  | 23 | *   Aleksandar Kuzmanovic <akuzma@northwestern.edu> | 
|  | 24 | * Available from: | 
|  | 25 | *   http://www.ece.rice.edu/~akuzma/Doc/akuzma/TCP-LP.pdf | 
|  | 26 | * Original implementation for 2.4.19: | 
|  | 27 | *   http://www-ece.rice.edu/networks/TCP-LP/ | 
| Wong Hoi Sing Edison | 7c106d7 | 2006-06-05 17:27:58 -0700 | [diff] [blame] | 28 | * | 
| Wong Hoi Sing Edison | 3795da4 | 2006-09-13 20:30:30 -0700 | [diff] [blame] | 29 | * 2.6.x module Authors: | 
|  | 30 | *   Wong Hoi Sing, Edison <hswong3i@gmail.com> | 
|  | 31 | *   Hung Hing Lun, Mike <hlhung3i@gmail.com> | 
|  | 32 | * SourceForge project page: | 
|  | 33 | *   http://tcp-lp-mod.sourceforge.net/ | 
| Wong Hoi Sing Edison | 7c106d7 | 2006-06-05 17:27:58 -0700 | [diff] [blame] | 34 | */ | 
|  | 35 |  | 
| Wong Hoi Sing Edison | 7c106d7 | 2006-06-05 17:27:58 -0700 | [diff] [blame] | 36 | #include <linux/module.h> | 
|  | 37 | #include <net/tcp.h> | 
|  | 38 |  | 
|  | 39 | /* resolution of owd */ | 
|  | 40 | #define LP_RESOL       1000 | 
|  | 41 |  | 
|  | 42 | /** | 
|  | 43 | * enum tcp_lp_state | 
|  | 44 | * @LP_VALID_RHZ: is remote HZ valid? | 
|  | 45 | * @LP_VALID_OWD: is OWD valid? | 
|  | 46 | * @LP_WITHIN_THR: are we within threshold? | 
|  | 47 | * @LP_WITHIN_INF: are we within inference? | 
|  | 48 | * | 
|  | 49 | * TCP-LP's state flags. | 
|  | 50 | * We create this set of state flag mainly for debugging. | 
|  | 51 | */ | 
|  | 52 | enum tcp_lp_state { | 
|  | 53 | LP_VALID_RHZ = (1 << 0), | 
|  | 54 | LP_VALID_OWD = (1 << 1), | 
|  | 55 | LP_WITHIN_THR = (1 << 3), | 
|  | 56 | LP_WITHIN_INF = (1 << 4), | 
|  | 57 | }; | 
|  | 58 |  | 
|  | 59 | /** | 
|  | 60 | * struct lp | 
|  | 61 | * @flag: TCP-LP state flag | 
|  | 62 | * @sowd: smoothed OWD << 3 | 
|  | 63 | * @owd_min: min OWD | 
|  | 64 | * @owd_max: max OWD | 
|  | 65 | * @owd_max_rsv: resrved max owd | 
|  | 66 | * @remote_hz: estimated remote HZ | 
|  | 67 | * @remote_ref_time: remote reference time | 
|  | 68 | * @local_ref_time: local reference time | 
|  | 69 | * @last_drop: time for last active drop | 
|  | 70 | * @inference: current inference | 
|  | 71 | * | 
|  | 72 | * TCP-LP's private struct. | 
|  | 73 | * We get the idea from original TCP-LP implementation where only left those we | 
|  | 74 | * found are really useful. | 
|  | 75 | */ | 
|  | 76 | struct lp { | 
|  | 77 | u32 flag; | 
|  | 78 | u32 sowd; | 
|  | 79 | u32 owd_min; | 
|  | 80 | u32 owd_max; | 
|  | 81 | u32 owd_max_rsv; | 
|  | 82 | u32 remote_hz; | 
|  | 83 | u32 remote_ref_time; | 
|  | 84 | u32 local_ref_time; | 
|  | 85 | u32 last_drop; | 
|  | 86 | u32 inference; | 
|  | 87 | }; | 
|  | 88 |  | 
|  | 89 | /** | 
|  | 90 | * tcp_lp_init | 
|  | 91 | * | 
|  | 92 | * Init all required variables. | 
|  | 93 | * Clone the handling from Vegas module implementation. | 
|  | 94 | */ | 
|  | 95 | static void tcp_lp_init(struct sock *sk) | 
|  | 96 | { | 
|  | 97 | struct lp *lp = inet_csk_ca(sk); | 
|  | 98 |  | 
|  | 99 | lp->flag = 0; | 
|  | 100 | lp->sowd = 0; | 
|  | 101 | lp->owd_min = 0xffffffff; | 
|  | 102 | lp->owd_max = 0; | 
|  | 103 | lp->owd_max_rsv = 0; | 
|  | 104 | lp->remote_hz = 0; | 
|  | 105 | lp->remote_ref_time = 0; | 
|  | 106 | lp->local_ref_time = 0; | 
|  | 107 | lp->last_drop = 0; | 
|  | 108 | lp->inference = 0; | 
|  | 109 | } | 
|  | 110 |  | 
|  | 111 | /** | 
|  | 112 | * tcp_lp_cong_avoid | 
|  | 113 | * | 
|  | 114 | * Implementation of cong_avoid. | 
|  | 115 | * Will only call newReno CA when away from inference. | 
|  | 116 | * From TCP-LP's paper, this will be handled in additive increasement. | 
|  | 117 | */ | 
|  | 118 | static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, | 
|  | 119 | int flag) | 
|  | 120 | { | 
|  | 121 | struct lp *lp = inet_csk_ca(sk); | 
|  | 122 |  | 
|  | 123 | if (!(lp->flag & LP_WITHIN_INF)) | 
|  | 124 | tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag); | 
|  | 125 | } | 
|  | 126 |  | 
|  | 127 | /** | 
|  | 128 | * tcp_lp_remote_hz_estimator | 
|  | 129 | * | 
|  | 130 | * Estimate remote HZ. | 
|  | 131 | * We keep on updating the estimated value, where original TCP-LP | 
|  | 132 | * implementation only guest it for once and use forever. | 
|  | 133 | */ | 
|  | 134 | static u32 tcp_lp_remote_hz_estimator(struct sock *sk) | 
|  | 135 | { | 
|  | 136 | struct tcp_sock *tp = tcp_sk(sk); | 
|  | 137 | struct lp *lp = inet_csk_ca(sk); | 
|  | 138 | s64 rhz = lp->remote_hz << 6;	/* remote HZ << 6 */ | 
|  | 139 | s64 m = 0; | 
|  | 140 |  | 
|  | 141 | /* not yet record reference time | 
|  | 142 | * go away!! record it before come back!! */ | 
|  | 143 | if (lp->remote_ref_time == 0 || lp->local_ref_time == 0) | 
|  | 144 | goto out; | 
|  | 145 |  | 
|  | 146 | /* we can't calc remote HZ with no different!! */ | 
|  | 147 | if (tp->rx_opt.rcv_tsval == lp->remote_ref_time | 
|  | 148 | || tp->rx_opt.rcv_tsecr == lp->local_ref_time) | 
|  | 149 | goto out; | 
|  | 150 |  | 
|  | 151 | m = HZ * (tp->rx_opt.rcv_tsval - | 
|  | 152 | lp->remote_ref_time) / (tp->rx_opt.rcv_tsecr - | 
|  | 153 | lp->local_ref_time); | 
|  | 154 | if (m < 0) | 
|  | 155 | m = -m; | 
|  | 156 |  | 
| Wong Hoi Sing Edison | 3795da4 | 2006-09-13 20:30:30 -0700 | [diff] [blame] | 157 | if (rhz > 0) { | 
| Wong Hoi Sing Edison | 7c106d7 | 2006-06-05 17:27:58 -0700 | [diff] [blame] | 158 | m -= rhz >> 6;	/* m is now error in remote HZ est */ | 
|  | 159 | rhz += m;	/* 63/64 old + 1/64 new */ | 
|  | 160 | } else | 
|  | 161 | rhz = m << 6; | 
|  | 162 |  | 
| Wong Hoi Sing Edison | 7c106d7 | 2006-06-05 17:27:58 -0700 | [diff] [blame] | 163 | out: | 
| Wong Hoi Sing Edison | 3795da4 | 2006-09-13 20:30:30 -0700 | [diff] [blame] | 164 | /* record time for successful remote HZ calc */ | 
| Wong Hoi Sing Edison | bfbea8a | 2006-09-28 14:40:21 -0700 | [diff] [blame] | 165 | if ((rhz >> 6) > 0) | 
| Wong Hoi Sing Edison | 3795da4 | 2006-09-13 20:30:30 -0700 | [diff] [blame] | 166 | lp->flag |= LP_VALID_RHZ; | 
|  | 167 | else | 
|  | 168 | lp->flag &= ~LP_VALID_RHZ; | 
|  | 169 |  | 
| Wong Hoi Sing Edison | 7c106d7 | 2006-06-05 17:27:58 -0700 | [diff] [blame] | 170 | /* record reference time stamp */ | 
|  | 171 | lp->remote_ref_time = tp->rx_opt.rcv_tsval; | 
|  | 172 | lp->local_ref_time = tp->rx_opt.rcv_tsecr; | 
|  | 173 |  | 
|  | 174 | return rhz >> 6; | 
|  | 175 | } | 
|  | 176 |  | 
|  | 177 | /** | 
|  | 178 | * tcp_lp_owd_calculator | 
|  | 179 | * | 
|  | 180 | * Calculate one way delay (in relative format). | 
|  | 181 | * Original implement OWD as minus of remote time difference to local time | 
|  | 182 | * difference directly. As this time difference just simply equal to RTT, when | 
|  | 183 | * the network status is stable, remote RTT will equal to local RTT, and result | 
|  | 184 | * OWD into zero. | 
|  | 185 | * It seems to be a bug and so we fixed it. | 
|  | 186 | */ | 
|  | 187 | static u32 tcp_lp_owd_calculator(struct sock *sk) | 
|  | 188 | { | 
|  | 189 | struct tcp_sock *tp = tcp_sk(sk); | 
|  | 190 | struct lp *lp = inet_csk_ca(sk); | 
|  | 191 | s64 owd = 0; | 
|  | 192 |  | 
|  | 193 | lp->remote_hz = tcp_lp_remote_hz_estimator(sk); | 
|  | 194 |  | 
|  | 195 | if (lp->flag & LP_VALID_RHZ) { | 
|  | 196 | owd = | 
|  | 197 | tp->rx_opt.rcv_tsval * (LP_RESOL / lp->remote_hz) - | 
|  | 198 | tp->rx_opt.rcv_tsecr * (LP_RESOL / HZ); | 
|  | 199 | if (owd < 0) | 
|  | 200 | owd = -owd; | 
|  | 201 | } | 
|  | 202 |  | 
|  | 203 | if (owd > 0) | 
|  | 204 | lp->flag |= LP_VALID_OWD; | 
|  | 205 | else | 
|  | 206 | lp->flag &= ~LP_VALID_OWD; | 
|  | 207 |  | 
|  | 208 | return owd; | 
|  | 209 | } | 
|  | 210 |  | 
|  | 211 | /** | 
|  | 212 | * tcp_lp_rtt_sample | 
|  | 213 | * | 
|  | 214 | * Implementation or rtt_sample. | 
|  | 215 | * Will take the following action, | 
|  | 216 | *   1. calc OWD, | 
|  | 217 | *   2. record the min/max OWD, | 
|  | 218 | *   3. calc smoothed OWD (SOWD). | 
|  | 219 | * Most ideas come from the original TCP-LP implementation. | 
|  | 220 | */ | 
|  | 221 | static void tcp_lp_rtt_sample(struct sock *sk, u32 usrtt) | 
|  | 222 | { | 
|  | 223 | struct lp *lp = inet_csk_ca(sk); | 
|  | 224 | s64 mowd = tcp_lp_owd_calculator(sk); | 
|  | 225 |  | 
|  | 226 | /* sorry that we don't have valid data */ | 
|  | 227 | if (!(lp->flag & LP_VALID_RHZ) || !(lp->flag & LP_VALID_OWD)) | 
|  | 228 | return; | 
|  | 229 |  | 
|  | 230 | /* record the next min owd */ | 
|  | 231 | if (mowd < lp->owd_min) | 
|  | 232 | lp->owd_min = mowd; | 
|  | 233 |  | 
|  | 234 | /* always forget the max of the max | 
|  | 235 | * we just set owd_max as one below it */ | 
|  | 236 | if (mowd > lp->owd_max) { | 
|  | 237 | if (mowd > lp->owd_max_rsv) { | 
|  | 238 | if (lp->owd_max_rsv == 0) | 
|  | 239 | lp->owd_max = mowd; | 
|  | 240 | else | 
|  | 241 | lp->owd_max = lp->owd_max_rsv; | 
|  | 242 | lp->owd_max_rsv = mowd; | 
|  | 243 | } else | 
|  | 244 | lp->owd_max = mowd; | 
|  | 245 | } | 
|  | 246 |  | 
|  | 247 | /* calc for smoothed owd */ | 
|  | 248 | if (lp->sowd != 0) { | 
|  | 249 | mowd -= lp->sowd >> 3;	/* m is now error in owd est */ | 
|  | 250 | lp->sowd += mowd;	/* owd = 7/8 owd + 1/8 new */ | 
|  | 251 | } else | 
|  | 252 | lp->sowd = mowd << 3;	/* take the measured time be owd */ | 
|  | 253 | } | 
|  | 254 |  | 
|  | 255 | /** | 
|  | 256 | * tcp_lp_pkts_acked | 
|  | 257 | * | 
|  | 258 | * Implementation of pkts_acked. | 
|  | 259 | * Deal with active drop under Early Congestion Indication. | 
|  | 260 | * Only drop to half and 1 will be handle, because we hope to use back | 
|  | 261 | * newReno in increase case. | 
|  | 262 | * We work it out by following the idea from TCP-LP's paper directly | 
|  | 263 | */ | 
|  | 264 | static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked) | 
|  | 265 | { | 
|  | 266 | struct tcp_sock *tp = tcp_sk(sk); | 
|  | 267 | struct lp *lp = inet_csk_ca(sk); | 
|  | 268 |  | 
|  | 269 | /* calc inference */ | 
|  | 270 | if (tcp_time_stamp > tp->rx_opt.rcv_tsecr) | 
|  | 271 | lp->inference = 3 * (tcp_time_stamp - tp->rx_opt.rcv_tsecr); | 
|  | 272 |  | 
|  | 273 | /* test if within inference */ | 
|  | 274 | if (lp->last_drop && (tcp_time_stamp - lp->last_drop < lp->inference)) | 
|  | 275 | lp->flag |= LP_WITHIN_INF; | 
|  | 276 | else | 
|  | 277 | lp->flag &= ~LP_WITHIN_INF; | 
|  | 278 |  | 
|  | 279 | /* test if within threshold */ | 
|  | 280 | if (lp->sowd >> 3 < | 
|  | 281 | lp->owd_min + 15 * (lp->owd_max - lp->owd_min) / 100) | 
|  | 282 | lp->flag |= LP_WITHIN_THR; | 
|  | 283 | else | 
|  | 284 | lp->flag &= ~LP_WITHIN_THR; | 
|  | 285 |  | 
|  | 286 | pr_debug("TCP-LP: %05o|%5u|%5u|%15u|%15u|%15u\n", lp->flag, | 
|  | 287 | tp->snd_cwnd, lp->remote_hz, lp->owd_min, lp->owd_max, | 
|  | 288 | lp->sowd >> 3); | 
|  | 289 |  | 
|  | 290 | if (lp->flag & LP_WITHIN_THR) | 
|  | 291 | return; | 
|  | 292 |  | 
|  | 293 | /* FIXME: try to reset owd_min and owd_max here | 
|  | 294 | * so decrease the chance the min/max is no longer suitable | 
|  | 295 | * and will usually within threshold when whithin inference */ | 
|  | 296 | lp->owd_min = lp->sowd >> 3; | 
|  | 297 | lp->owd_max = lp->sowd >> 2; | 
|  | 298 | lp->owd_max_rsv = lp->sowd >> 2; | 
|  | 299 |  | 
|  | 300 | /* happened within inference | 
|  | 301 | * drop snd_cwnd into 1 */ | 
|  | 302 | if (lp->flag & LP_WITHIN_INF) | 
|  | 303 | tp->snd_cwnd = 1U; | 
|  | 304 |  | 
|  | 305 | /* happened after inference | 
|  | 306 | * cut snd_cwnd into half */ | 
|  | 307 | else | 
|  | 308 | tp->snd_cwnd = max(tp->snd_cwnd >> 1U, 1U); | 
|  | 309 |  | 
|  | 310 | /* record this drop time */ | 
|  | 311 | lp->last_drop = tcp_time_stamp; | 
|  | 312 | } | 
|  | 313 |  | 
|  | 314 | static struct tcp_congestion_ops tcp_lp = { | 
|  | 315 | .init = tcp_lp_init, | 
|  | 316 | .ssthresh = tcp_reno_ssthresh, | 
|  | 317 | .cong_avoid = tcp_lp_cong_avoid, | 
|  | 318 | .min_cwnd = tcp_reno_min_cwnd, | 
|  | 319 | .rtt_sample = tcp_lp_rtt_sample, | 
|  | 320 | .pkts_acked = tcp_lp_pkts_acked, | 
|  | 321 |  | 
|  | 322 | .owner = THIS_MODULE, | 
|  | 323 | .name = "lp" | 
|  | 324 | }; | 
|  | 325 |  | 
|  | 326 | static int __init tcp_lp_register(void) | 
|  | 327 | { | 
| Alexey Dobriyan | 2989697 | 2006-08-25 00:37:24 -0700 | [diff] [blame] | 328 | BUILD_BUG_ON(sizeof(struct lp) > ICSK_CA_PRIV_SIZE); | 
| Wong Hoi Sing Edison | 7c106d7 | 2006-06-05 17:27:58 -0700 | [diff] [blame] | 329 | return tcp_register_congestion_control(&tcp_lp); | 
|  | 330 | } | 
|  | 331 |  | 
|  | 332 | static void __exit tcp_lp_unregister(void) | 
|  | 333 | { | 
|  | 334 | tcp_unregister_congestion_control(&tcp_lp); | 
|  | 335 | } | 
|  | 336 |  | 
|  | 337 | module_init(tcp_lp_register); | 
|  | 338 | module_exit(tcp_lp_unregister); | 
|  | 339 |  | 
| Wong Hoi Sing Edison | 3795da4 | 2006-09-13 20:30:30 -0700 | [diff] [blame] | 340 | MODULE_AUTHOR("Wong Hoi Sing Edison, Hung Hing Lun Mike"); | 
| Wong Hoi Sing Edison | 7c106d7 | 2006-06-05 17:27:58 -0700 | [diff] [blame] | 341 | MODULE_LICENSE("GPL"); | 
|  | 342 | MODULE_DESCRIPTION("TCP Low Priority"); |